Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 095c650

Browse files
committed
Add offload_kqv option to llama and server
1 parent 472b344 commit 095c650
Copy full SHA for 095c650

File tree

Expand file tree / Collapse file tree

2 files changed

+7
-0
lines changed
Filter options
Expand file tree / Collapse file tree

2 files changed

+7
-0
lines changed

‎llama_cpp/llama.py

Copy file name to clipboard. Expand all lines: llama_cpp/llama.py
+3 −0 — Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,7 @@ def __init__(
752752
mul_mat_q: bool = True,
753753
logits_all: bool = False,
754754
embedding: bool = False,
755+
offload_kqv: bool = False,
755756
# Sampling Params
756757
last_n_tokens_size: int = 64,
757758
# LoRA Params
@@ -817,6 +818,7 @@ def __init__(
817818
yarn_orig_ctx: YaRN original context size
818819
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
819820
embedding: Embedding mode only.
821+
offload_kqv: Offload K, Q, V to GPU.
820822
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
821823
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
822824
lora_path: Path to a LoRA file to apply to the model.
@@ -903,6 +905,7 @@ def __init__(
903905
self.context_params.mul_mat_q = mul_mat_q
904906
self.context_params.logits_all = logits_all
905907
self.context_params.embedding = embedding
908+
self.context_params.offload_kqv = offload_kqv
906909

907910
# Sampling Params
908911
self.last_n_tokens_size = last_n_tokens_size

‎llama_cpp/server/app.py

Copy file name to clipboard. Expand all lines: llama_cpp/server/app.py
+4 −0 — Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ class Settings(BaseSettings):
100100
)
101101
logits_all: bool = Field(default=True, description="Whether to return logits.")
102102
embedding: bool = Field(default=True, description="Whether to use embeddings.")
103+
offload_kqv: bool = Field(
104+
default=False, description="Whether to offload kqv to the GPU."
105+
)
103106
# Sampling Params
104107
last_n_tokens_size: int = Field(
105108
default=64,
@@ -409,6 +412,7 @@ def create_app(settings: Optional[Settings] = None):
409412
mul_mat_q=settings.mul_mat_q,
410413
logits_all=settings.logits_all,
411414
embedding=settings.embedding,
415+
offload_kqv=settings.offload_kqv,
412416
# Sampling Params
413417
last_n_tokens_size=settings.last_n_tokens_size,
414418
# LoRA Params

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.