Commit 9a6cdd7

ggerganov authored and mglambda committed

llama : refactor src/llama.cpp (ggml-org#10902)

* llama : scatter llama.cpp into multiple modules (wip)
* llama : control-vector -> adapter
* llama : arch
* llama : mmap ggml-ci
* ci : remove BUILD_SHARED_LIBS=OFF ggml-ci
* llama : arch (cont) ggml-ci
* llama : chat ggml-ci
* llama : model ggml-ci
* llama : hparams ggml-ci
* llama : adapter ggml-ci
* examples : fix ggml-ci
* rebase ggml-ci
* minor
* llama : kv cache ggml-ci
* llama : impl ggml-ci
* llama : batch ggml-ci
* cont ggml-ci
* llama : context ggml-ci
* minor
* llama : context (cont) ggml-ci
* llama : model loader ggml-ci
* common : update lora ggml-ci
* llama : quant ggml-ci
* llama : quant (cont) ggml-ci
* minor [no ci]

1 parent 0f4db31 commit 9a6cdd7


61 files changed: +20419 −19875 lines

‎.github/workflows/build.yml

13 additions, 15 deletions
@@ -60,8 +60,7 @@ jobs:
             -DLLAMA_CURL=ON \
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
-            -DBUILD_SHARED_LIBS=OFF
+            -DGGML_RPC=ON
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -123,8 +122,7 @@ jobs:
             -DLLAMA_FATAL_WARNINGS=ON \
             -DLLAMA_CURL=ON \
             -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DBUILD_SHARED_LIBS=OFF
+            -DGGML_RPC=ON
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -181,7 +179,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
@@ -651,23 +649,23 @@ jobs:
       matrix:
         include:
           - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
           - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
           - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
           - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
           - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
           - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
           - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -914,7 +912,7 @@ jobs:
         shell: cmd
         run: |
           call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
           cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release

‎common/arg.cpp

2 additions, 2 deletions
@@ -1512,15 +1512,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0 });
+            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale) });
+            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
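
The extra nullptr in the two initializers above exists because common_lora_adapter_info gained a third member in this commit (see the common/common.h diff below). A minimal sketch of what the parser now builds, assuming that struct layout; the file names and scales here are made up for illustration:

    #include <string>
    #include <vector>

    // assumed layout, mirroring the common/common.h change in this commit
    struct common_lora_adapter_info {
        std::string path;
        float scale;
        struct llama_lora_adapter * ptr; // filled in later by common_init_from_params()
    };

    // hypothetical result of: --lora a.gguf --lora-scaled b.gguf 0.5
    std::vector<common_lora_adapter_info> lora_adapters = {
        { "a.gguf", 1.0f, nullptr },
        { "b.gguf", 0.5f, nullptr },
    };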

‎common/common.cpp

13 additions, 12 deletions
@@ -922,20 +922,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -996,17 +997,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
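
The loop above moves ownership of each adapter into iparams.lora while params.lora_adapters keeps only a raw, non-owning pointer, which is why la.ptr is recorded with get() before the move. A small self-contained sketch of that handoff, using std::unique_ptr and hypothetical stand-in types (llama_lora_adapter_ptr itself is assumed to be a unique_ptr-style alias):

    #include <memory>
    #include <utility>
    #include <vector>

    struct adapter { };                            // stand-in for llama_lora_adapter
    using adapter_ptr = std::unique_ptr<adapter>;  // stand-in for llama_lora_adapter_ptr

    struct adapter_info {                          // mirrors common_lora_adapter_info
        adapter * ptr = nullptr;                   // non-owning view
    };

    int main() {
        std::vector<adapter_info> infos(1);        // caller-facing list (like params.lora_adapters)
        std::vector<adapter_ptr>  owned;           // owning list (like iparams.lora)

        adapter_ptr a(new adapter());              // "load" the adapter
        infos[0].ptr = a.get();                    // record the raw pointer before moving
        owned.emplace_back(std::move(a));          // transfer ownership; a is now empty

        // infos[0].ptr stays valid for as long as owned keeps the adapter alive
        return 0;
    }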

‎common/common.h

15 additions, 11 deletions
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -478,10 +476,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
         const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -640,6 +640,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
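
common_init_result now owns the model, context and adapters through llama_model_ptr, llama_context_ptr and llama_lora_adapter_ptr, which come from the newly included llama-cpp.h. As an assumption for illustration only (check the actual header for the real definitions), these aliases can be pictured as std::unique_ptr specializations whose deleters call the corresponding C API free functions:

    // sketch, not the actual llama-cpp.h contents
    #include <memory>
    #include "llama.h"

    struct llama_model_deleter {
        void operator()(llama_model * model) { llama_free_model(model); }
    };

    struct llama_context_deleter {
        void operator()(llama_context * ctx) { llama_free(ctx); }
    };

    typedef std::unique_ptr<llama_model,   llama_model_deleter>   llama_model_ptr;
    typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;

With aliases of this kind, destroying a common_init_result releases the context, the model and the adapters in one place, which is what lets the examples below drop their manual cleanup calls.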

‎examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

5 additions, 5 deletions
@@ -434,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) {
     }
 }
 
-struct llama_file {
+struct my_llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
 
-    llama_file(const char * fname, const char * mode) {
+    my_llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
             size = 0;
@@ -500,15 +500,15 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    ~llama_file() {
+    ~my_llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
 };
 
 static bool is_ggml_file(const char * filename) {
-    llama_file file(filename, "rb");
+    my_llama_file file(filename, "rb");
     if (file.size < 4) {
         return false;
     }
@@ -576,7 +576,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
     } else {
         // assume llama2.c vocabulary
         LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
-        llama_file file(filename, "rb");
+        my_llama_file file(filename, "rb");
         if (!file.fp) {
             die_fmt("%s: %s", strerror(errno), filename);
         }

‎examples/cvector-generator/cvector-generator.cpp

3 additions, 4 deletions
@@ -415,12 +415,13 @@ int main(int argc, char ** argv) {
     // load the model to get hparams
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
     int n_embd = llama_n_embd(model);
+
     // get model hint param (a.k.a model arch name)
     char model_hint[128];
     llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +475,6 @@ int main(int argc, char ** argv) {
 
     // done with the model, we can now free it to make gain some memory
     printf("Done evaluate prompts, unload model...\n");
-    llama_free(ctx);
-    llama_free_model(model);
 
     bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 
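
The same migration repeats across the examples that follow: callers now take raw, non-owning pointers out of common_init_result with .get() and no longer call llama_free / llama_free_model themselves (in cvector-generator this means the model stays loaded until llama_init is destroyed at the end of main, even though the "free it" comment remains). A minimal sketch of the resulting caller pattern, with the inference work itself elided:

    #include "common.h"
    #include "llama.h"

    // params is assumed to be populated elsewhere, e.g. by the common argument parser
    static int run(common_params & params) {
        common_init_result init = common_init_from_params(params);

        // raw, non-owning views for use with the llama.h C API
        llama_model   * model = init.model.get();
        llama_context * ctx   = init.context.get();
        if (model == nullptr || ctx == nullptr) {
            return 1;
        }

        // ... evaluate batches with ctx ...

        // no llama_free(ctx) / llama_free_model(model) here: init releases the
        // context, the model and any loaded LoRA adapters when it goes out of scope
        return 0;
    }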

‎examples/embedding/embedding.cpp

3 additions, 4 deletions
@@ -97,8 +97,9 @@ int main(int argc, char ** argv) {
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
@@ -316,8 +317,6 @@ int main(int argc, char ** argv) {
 
     // clean up
     llama_batch_free(batch);
-    llama_free(ctx);
-    llama_free_model(model);
     llama_backend_free();
 
     return 0;

‎examples/eval-callback/eval-callback.cpp

3 additions, 5 deletions
@@ -162,8 +162,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
@@ -184,9 +185,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;

‎examples/gguf-split/gguf-split.cpp

3 additions, 4 deletions
@@ -2,15 +2,14 @@
 #include "common.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <string>
 #include <vector>
-
-#include <stdio.h>
-#include <string.h>
 #include <climits>
+
+#include <cstdio>
+#include <cstring>
 #include <stdexcept>
 
 #if defined(_WIN32)

‎examples/imatrix/imatrix.cpp

5 additions, 6 deletions
@@ -430,9 +430,10 @@ static void process_logits(
 
 static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);
 
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
@@ -618,8 +619,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
@@ -655,9 +657,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
    llama_backend_free();
 
    return 0;

‎examples/infill/infill.cpp

2 additions, 5 deletions
@@ -131,8 +131,8 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
 
-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -581,9 +581,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     common_perf_print(ctx, smpl);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     common_sampler_free(smpl);
     llama_backend_free();
 
0 commit comments
