Commit f1edc66
Update llama.cpp
1 parent f3b844e commit f1edc66

2 files changed: +46 -7 lines changed

llama_cpp/llama_cpp.py
45 additions & 6 deletions

@@ -103,8 +103,8 @@ def _load_shared_library(lib_base_name: str):
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 2
-LLAMA_SESSION_VERSION = 2
+# define LLAMA_SESSION_VERSION 3
+LLAMA_SESSION_VERSION = 3
 
 
 # struct llama_model;
@@ -309,6 +309,35 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]
 
+# enum llama_model_kv_override_type {
+#     LLAMA_KV_OVERRIDE_INT,
+#     LLAMA_KV_OVERRIDE_FLOAT,
+#     LLAMA_KV_OVERRIDE_BOOL,
+# };
+class llama_model_kv_override_type(Structure):
+    _fields_ = [
+        ("LLAMA_KV_OVERRIDE_INT", c_int),
+        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
+        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
+    ]
+
+# struct llama_model_kv_override {
+#     char key[128];
+#     enum llama_model_kv_override_type tag;
+#     union {
+#         int64_t int_value;
+#         double float_value;
+#         bool bool_value;
+#     };
+# };
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", llama_model_kv_override_type),
+        ("int_value", ctypes.c_int64),
+        ("float_value", c_double),
+        ("bool_value", c_bool),
+    ]
 
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
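
The two ctypes structures added above mirror llama.cpp's llama_model_kv_override. For illustration only (not part of the commit, and using a hypothetical metadata key), a single entry could be filled like this:

import ctypes
from llama_cpp.llama_cpp import llama_model_kv_override

# Hypothetical sketch: one override entry for an integer-valued metadata key.
ov = llama_model_kv_override()
ov.key = b"example.metadata.key"  # illustrative key; copied into the 128-byte char array
ov.int_value = 8                  # in the C struct this occupies the value union
# ctypes zero-initializes new instances, so the tag bytes are already 0, which
# corresponds to LLAMA_KV_OVERRIDE_INT, the first enumerator in upstream llama.cpp.
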
@@ -320,6 +349,8 @@ class llama_batch(Structure):
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
 
+#     // override key-value pairs of the model meta data
+#     const struct llama_model_kv_override * kv_overrides;
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
@@ -335,6 +366,7 @@ class llama_model_params(Structure):
         tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
         progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
+        kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
@@ -344,6 +376,7 @@ class llama_model_params(Structure):
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
+        ("kv_overrides", POINTER(llama_model_kv_override)),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
@@ -367,12 +400,14 @@ class llama_model_params(Structure):
 #     float yarn_beta_slow; // YaRN high correction dim
 #     uint32_t yarn_orig_ctx; // YaRN original context size
 
+#     enum ggml_type type_k; // data type for K cache
+#     enum ggml_type type_v; // data type for V cache
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-#     bool f16_kv; // use fp16 for KV cache, fp32 otherwise
-#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
-#     bool embedding; // embedding mode only
+#     bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
+#     bool embedding; // embedding mode only
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
@@ -391,6 +426,8 @@ class llama_context_params(Structure):
         yarn_beta_fast (float): YaRN low correction dim
         yarn_beta_slow (float): YaRN high correction dim
         yarn_orig_ctx (int): YaRN original context size
+        type_k (int): data type for K cache
+        type_v (int): data type for V cache
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one
@@ -409,6 +446,8 @@ class llama_context_params(Structure):
         ("yarn_beta_fast", c_float),
         ("yarn_beta_slow", c_float),
         ("yarn_orig_ctx", c_uint32),
+        ("type_k", c_int),
+        ("type_v", c_int),
         ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),

vendor/llama.cpp
submodule commit updated (the remaining +1 -1 of this commit)
0 commit comments