Commit 62944df

Bugfix: Remove f16_kv, add offload_kqv field (abetlen#1019)
F16_KV appears to have been removed here: ggml-org/llama.cpp@af99c6f

This addresses two issues:

- abetlen#995, which requests adding the KV cache offloading param
- abetlen#1006, a NULL pointer exception when using embeddings (introduced by leaving f16_kv in the fields struct)
1 parent 37da8e8 commit 62944df
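
The crash in abetlen#1006 comes down to struct layout: the Python-side ctypes declaration must mirror the C struct field-for-field. Below is a small illustrative sketch (not code from this commit; both class names are hypothetical) showing how keeping a field the C side has dropped shifts the offset of every later field.

import ctypes

class StaleParams(ctypes.Structure):
    # Hypothetical: still declares the removed f16_kv field.
    _fields_ = [
        ("mul_mat_q", ctypes.c_bool),
        ("f16_kv", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
        ("embedding", ctypes.c_bool),
    ]

class CurrentParams(ctypes.Structure):
    # Hypothetical: matches the new layout, with offload_kqv appended.
    _fields_ = [
        ("mul_mat_q", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
        ("embedding", ctypes.c_bool),
        ("offload_kqv", ctypes.c_bool),
    ]

# Same field name, different byte offset: every field after the stale f16_kv entry
# is read from the wrong place in memory, which is how garbage values and pointers
# can surface on the Python side.
print(StaleParams.logits_all.offset, CurrentParams.logits_all.offset)  # 2 vs 1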

3 files changed: +3 additions, -10 deletions

llama_cpp/llama.py

0 additions & 5 deletions
@@ -751,7 +751,6 @@ def __init__(
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         mul_mat_q: bool = True,
-        f16_kv: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
         # Sampling Params
@@ -817,7 +816,6 @@ def __init__(
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
-            f16_kv: Use fp16 for KV cache, fp32 otherwise
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -904,7 +902,6 @@ def __init__(
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
         self.context_params.mul_mat_q = mul_mat_q
-        # self.context_params.f16_kv = f16_kv
         self.context_params.logits_all = logits_all
         self.context_params.embedding = embedding

@@ -2155,7 +2152,6 @@ def __getstate__(self):
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             mul_mat_q=self.context_params.mul_mat_q,
-            f16_kv=self.context_params.f16_kv,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -2198,7 +2194,6 @@ def __setstate__(self, state):
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
             mul_mat_q=state["mul_mat_q"],
-            f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params
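
For callers of the high-level API, the practical effect of this file's change is that Llama() no longer accepts an f16_kv argument. A minimal migration sketch (the model path is a placeholder):

from llama_cpp import Llama

# Before this commit: Llama(model_path="models/7B/llama-model.gguf", f16_kv=True)
# After this commit: drop the argument entirely.
llm = Llama(model_path="models/7B/llama-model.gguf")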

llama_cpp/llama_cpp.py

3 additions & 3 deletions
@@ -432,9 +432,9 @@ class llama_context_params(Structure):
         type_k (int): data type for K cache
         type_v (int): data type for V cache
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        embedding (bool): embedding mode only"""
+        embedding (bool): embedding mode only
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -452,9 +452,9 @@ class llama_context_params(Structure):
         ("type_k", c_int),
         ("type_v", c_int),
         ("mul_mat_q", c_bool),
-        ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("embedding", c_bool),
+        ("offload_kqv", c_bool),
     ]

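A hedged sketch of how the new field can be set from the low-level bindings, assuming llama_context_default_params() is exposed here as the usual wrapper for the corresponding llama.cpp C function:

import llama_cpp

params = llama_cpp.llama_context_default_params()
params.offload_kqv = True  # offload the KQV ops (including the KV cache) to the GPU
# params.f16_kv no longer exists; KV cache data types are governed by type_k / type_v instead.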

llama_cpp/server/app.py

0 additions & 2 deletions
@@ -98,7 +98,6 @@ class Settings(BaseSettings):
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
-    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
     logits_all: bool = Field(default=True, description="Whether to return logits.")
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
     # Sampling Params
@@ -408,7 +407,6 @@ def create_app(settings: Optional[Settings] = None):
         yarn_beta_slow=settings.yarn_beta_slow,
         yarn_orig_ctx=settings.yarn_orig_ctx,
         mul_mat_q=settings.mul_mat_q,
-        f16_kv=settings.f16_kv,
         logits_all=settings.logits_all,
         embedding=settings.embedding,
         # Sampling Params
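
On the server side, f16_kv is simply no longer a recognized setting. A minimal sketch, assuming Settings still needs only a model path and that create_app(settings=...) remains the entry point (both appear in this file); the path is a placeholder:

from llama_cpp.server.app import Settings, create_app

settings = Settings(model="models/7B/llama-model.gguf")  # no f16_kv field anymore
app = create_app(settings=settings)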

0 commit comments
