Commit 7c898d5: Update llama.cpp
Parent: bb610b9

2 files changed: 26 additions, 8 deletions

‎llama_cpp/llama_cpp.py

25 additions, 7 deletions
@@ -112,8 +112,8 @@ def _load_shared_library(lib_base_name: str):
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 3
-LLAMA_SESSION_VERSION = 3
+# define LLAMA_SESSION_VERSION 4
+LLAMA_SESSION_VERSION = 4
 
 
 # struct llama_model;
@@ -220,6 +220,14 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 
+# enum llama_split_mode {
+#     LLAMA_SPLIT_NONE  = 0, // single GPU
+#     LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+# };
+LLAMA_SPLIT_NONE = 0
+LLAMA_SPLIT_LAYER = 1
+LLAMA_SPLIT_ROW = 2
 
 # typedef struct llama_token_data {
 #     llama_token id; // token id
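
The three constants added above mirror the C-side llama_split_mode enum one-to-one. As a hedged illustration only (the parse_split_mode helper below is hypothetical and not part of this commit), they can back a user-facing option that selects how the model is split across GPUs:

    import llama_cpp

    # Hypothetical helper, not part of this commit: map a user-facing option
    # string onto the split-mode constants added by this diff.
    SPLIT_MODES = {
        "none": llama_cpp.LLAMA_SPLIT_NONE,    # single GPU
        "layer": llama_cpp.LLAMA_SPLIT_LAYER,  # split layers and KV across GPUs
        "row": llama_cpp.LLAMA_SPLIT_ROW,      # split rows across GPUs
    }

    def parse_split_mode(name: str) -> int:
        return SPLIT_MODES[name.lower()]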
@@ -365,20 +373,28 @@ class llama_model_kv_override(Structure):
 
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
-#     int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-#     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+#     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+#     // main_gpu interpretation depends on split_mode:
+#     // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+#     // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+#     // LLAMA_SPLIT_LAYER: ignored
+#     int32_t main_gpu;
+
+#     // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+#     const float * tensor_split;
 
 #     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 #     // If the provided progress_callback returns true, model loading continues.
 #     // If it returns false, model loading is immediately aborted.
 #     llama_progress_callback progress_callback;
+
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
 
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;
 
-
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap;   // use mmap if possible
@@ -389,8 +405,9 @@ class llama_model_params(Structure):
 
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
-        main_gpu (int): the GPU that is used for scratch and small tensors
-        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        split_mode (int): how to split the model across multiple GPUs
+        main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -400,6 +417,7 @@ class llama_model_params(Structure):
 
     _fields_ = [
        ("n_gpu_layers", c_int32),
+       ("split_mode", c_int),
        ("main_gpu", c_int32),
        ("tensor_split", c_float_p),
        ("progress_callback", llama_progress_callback),

‎vendor/llama.cpp

1 addition, 1 deletion (submodule commit pointer updated)