Commit 4442ff8

shakalaca and abetlen authored
fix: error showing time spent in llama perf context print (abetlen#1898)
* feat: Sync with llama.cpp

  Add `no_perf` field to `llama_context_params` to optionally disable performance timing measurements.

* fix: Display performance metrics by default

Co-authored-by: Andrei <abetlen@gmail.com>
1 parent 14879c7 commit 4442ff8

File tree: 2 files changed, +7 -0 lines changed

llama_cpp/llama.py

Lines changed: 4 additions & 0 deletions
@@ -94,6 +94,7 @@ def __init__(
        offload_kqv: bool = True,
        flash_attn: bool = False,
        # Sampling Params
+       no_perf: bool = False,
        last_n_tokens_size: int = 64,
        # LoRA Params
        lora_base: Optional[str] = None,

@@ -173,6 +174,7 @@ def __init__(
            embedding: Embedding mode only.
            offload_kqv: Offload K, Q, V to GPU.
            flash_attn: Use flash attention.
+           no_perf: Measure performance timings.
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.

@@ -351,6 +353,7 @@ def __init__(
        if type_v is not None:
            self.context_params.type_v = type_v
        # Sampling Params
+       self.context_params.no_perf = no_perf
        self.last_n_tokens_size = last_n_tokens_size

        self.cache: Optional[BaseLlamaCache] = None

@@ -2093,6 +2096,7 @@ def __getstate__(self):
            offload_kqv=self.context_params.offload_kqv,
            flash_attn=self.context_params.flash_attn,
            # Sampling Params
+           no_perf=self.context_params.no_perf,
            last_n_tokens_size=self.last_n_tokens_size,
            # LoRA Params
            lora_base=self.lora_base,
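
Usage note (not part of the commit): a minimal sketch of how the new keyword argument could be passed to the high-level Llama constructor. The model path below is a placeholder; `no_perf` defaults to False, so llama.cpp's performance timings stay enabled unless explicitly turned off, which matches the "display performance metrics by default" fix.

from llama_cpp import Llama

# Hypothetical model path; substitute a real GGUF file.
llm = Llama(
    model_path="./models/model.gguf",
    no_perf=False,  # default: keep llama.cpp performance timing output enabled
)

Passing no_perf=True is forwarded to context_params.no_perf and suppresses the timing measurements.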

llama_cpp/llama_cpp.py

Lines changed: 3 additions & 0 deletions
@@ -780,6 +780,7 @@ class llama_context_params(ctypes.Structure):
        embeddings (bool): if true, extract embeddings (together with logits)
        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
        flash_attn (bool): whether to use flash attention
+       no_perf (bool): whether to measure performance timings
        abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
        abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
    """

@@ -810,6 +811,7 @@ class llama_context_params(ctypes.Structure):
    embeddings: bool
    offload_kqv: bool
    flash_attn: bool
+   no_perf: bool
    abort_callback: Callable[[ctypes.c_void_p], bool]
    abort_callback_data: ctypes.c_void_p

@@ -839,6 +841,7 @@ class llama_context_params(ctypes.Structure):
        ("embeddings", ctypes.c_bool),
        ("offload_kqv", ctypes.c_bool),
        ("flash_attn", ctypes.c_bool),
+       ("no_perf", ctypes.c_bool),
        ("abort_callback", ggml_abort_callback),
        ("abort_callback_data", ctypes.c_void_p),
    ]
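
For the low-level bindings, a minimal sketch (assumed usage, not from the commit) of setting the new struct field on a llama_context_params obtained from llama_context_default_params():

import llama_cpp

# Start from llama.cpp's default context parameters and flip the new field.
params = llama_cpp.llama_context_default_params()
params.no_perf = True  # disable performance timing measurements for this context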

0 commit comments