Commit 87a6e57

feat: Update llama.cpp

1 parent 13177aa · commit 87a6e57
File tree

2 files changed: +10 -6 lines changed

llama_cpp/llama_cpp.py

9 additions & 5 deletions
@@ -320,10 +320,12 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN

 # enum llama_pooling_type {
+#     LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
 #     LLAMA_POOLING_TYPE_NONE = 0,
 #     LLAMA_POOLING_TYPE_MEAN = 1,
 #     LLAMA_POOLING_TYPE_CLS = 2,
 # };
+LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2

@@ -547,7 +549,10 @@ class llama_model_params(ctypes.Structure):
 #     uint32_t n_batch;         // prompt processing maximum batch size
 #     uint32_t n_threads;       // number of threads to use for generation
 #     uint32_t n_threads_batch; // number of threads to use for batch processing
-#     int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+#     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+#     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+#                                                     // (ignored if no pooling layer)

 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 #     float rope_freq_base; // RoPE base frequency, 0 = from model

@@ -569,7 +574,6 @@ class llama_model_params(ctypes.Structure):
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embedding;   // embedding mode only
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)

 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted

@@ -587,6 +591,7 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
+        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model

@@ -602,7 +607,6 @@ class llama_context_params(ctypes.Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
         """

@@ -613,7 +617,8 @@ class llama_context_params(ctypes.Structure):
         ("n_batch", ctypes.c_uint32),
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
-        ("rope_scaling_type", ctypes.c_int32),
+        ("rope_scaling_type", ctypes.c_int),
+        ("pooling_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),

@@ -629,7 +634,6 @@ class llama_context_params(ctypes.Structure):
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("do_pooling", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
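
As a usage illustration (not part of this commit), here is a minimal Python sketch of how the new field and constants might be set through the low-level ctypes bindings after this update; it assumes the bindings expose llama_context_default_params(), and the choice of mean pooling is a placeholder.

import llama_cpp

# Start from the library's default context parameters.
params = llama_cpp.llama_context_default_params()

# `pooling_type` replaces the removed boolean `do_pooling` flag.
# LLAMA_POOLING_TYPE_UNSPECIFIED (-1) would keep the model's default;
# here we assume mean-pooled sequence embeddings are wanted.
params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_MEAN
params.embedding = True  # embedding mode only, per the struct docs above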

vendor/llama.cpp

Submodule pointer updated (the commit's remaining 1 addition & 1 deletion)
0 commit comments