Commit 8c2b24d

feat: Update llama.cpp

1 parent 6332527

2 files changed: 8 additions, 4 deletions

llama_cpp/llama_cpp.py: 7 additions, 3 deletions
@@ -242,8 +242,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 5
-LLAMA_SESSION_VERSION = 5
+# define LLAMA_SESSION_VERSION 6
+LLAMA_SESSION_VERSION = 6
 
 # define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
 LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
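
Since llama.cpp checks the session version on load, state files saved by builds using version 5 will no longer load after this bump. A quick sanity check against the updated constants (illustrative sketch, not part of this commit):

    import llama_cpp

    # Session files are now written with version 6; saves produced by
    # builds using version 5 fail the load-time version check.
    assert llama_cpp.LLAMA_SESSION_VERSION == 6
    assert llama_cpp.LLAMA_SESSION_MAGIC == llama_cpp.LLAMA_FILE_MAGIC_GGSN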
@@ -730,6 +730,7 @@ class llama_model_params(ctypes.Structure):
 # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool flash_attn; // whether to use flash attention
 
 
 # // Abort callback
@@ -766,6 +767,7 @@ class llama_context_params(ctypes.Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+        flash_attn (bool): whether to use flash attention
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
@@ -795,6 +797,7 @@ class llama_context_params(ctypes.Structure):
     logits_all: bool
     embeddings: bool
     offload_kqv: bool
+    flash_attn: bool
    abort_callback: Callable[[ctypes.c_void_p], bool]
     abort_callback_data: ctypes.c_void_p

@@ -823,6 +826,7 @@ class llama_context_params(ctypes.Structure):
         ("logits_all", ctypes.c_bool),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
+        ("flash_attn", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
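
With the field declared in all three places (the mirrored C header comment, the Python type annotations, and the ctypes _fields_ table), it can be set through the low-level API before a context is created. A minimal sketch of assumed usage (not from this diff; values are placeholders):

    import llama_cpp

    llama_cpp.llama_backend_init()
    params = llama_cpp.llama_context_default_params()
    params.flash_attn = True   # the field added by this commit
    params.n_ctx = 2048        # other context settings as usual

In releases that carry this binding, the high-level Llama wrapper exposes the same switch as a flash_attn constructor keyword.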
@@ -1615,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
     ...
 
 
-# // Clear the KV cache
+# // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_cache_clear(
 #         struct llama_context * ctx);
 @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
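
The binding itself is unchanged here; only the mirrored upstream comment was synced to clarify that clearing wipes both the cell metadata and the KV tensor data. Assumed usage, with ctx standing in for an existing llama_context_p handle:

    # Reset the cache between unrelated prompts: cell bookkeeping is erased
    # and the underlying KV data is zeroed, per the updated comment.
    llama_cpp.llama_kv_cache_clear(ctx)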

vendor/llama.cpp: submodule pointer updated (1 addition, 1 deletion)