File tree (Expand file tree / Collapse file tree): 2 files changed, +7 -0 lines changed
Filter options
Expand file tree / Collapse file tree: 2 files changed, +7 -0 lines changed
Original file line number Diff line number Diff line change @@ -752,6 +752,7 @@ def __init__(
752
752
mul_mat_q : bool = True ,
753
753
logits_all : bool = False ,
754
754
embedding : bool = False ,
755
+ offload_kqv : bool = False ,
755
756
# Sampling Params
756
757
last_n_tokens_size : int = 64 ,
757
758
# LoRA Params
@@ -817,6 +818,7 @@ def __init__(
817
818
yarn_orig_ctx: YaRN original context size
818
819
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
819
820
embedding: Embedding mode only.
821
+ offload_kqv: Offload the K, Q, V operations (including the KV cache) to the GPU.
820
822
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
821
823
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
822
824
lora_path: Path to a LoRA file to apply to the model.
@@ -903,6 +905,7 @@ def __init__(
903
905
self .context_params .mul_mat_q = mul_mat_q
904
906
self .context_params .logits_all = logits_all
905
907
self .context_params .embedding = embedding
908
+ self .context_params .offload_kqv = offload_kqv
906
909
907
910
# Sampling Params
908
911
self .last_n_tokens_size = last_n_tokens_size
Original file line number Diff line number Diff line change @@ -100,6 +100,9 @@ class Settings(BaseSettings):
100
100
)
101
101
logits_all : bool = Field (default = True , description = "Whether to return logits." )
102
102
embedding : bool = Field (default = True , description = "Whether to use embeddings." )
103
+ offload_kqv : bool = Field (
104
+ default = False , description = "Whether to offload kqv to the GPU."
105
+ )
103
106
# Sampling Params
104
107
last_n_tokens_size : int = Field (
105
108
default = 64 ,
@@ -409,6 +412,7 @@ def create_app(settings: Optional[Settings] = None):
409
412
mul_mat_q = settings .mul_mat_q ,
410
413
logits_all = settings .logits_all ,
411
414
embedding = settings .embedding ,
415
+ offload_kqv = settings .offload_kqv ,
412
416
# Sampling Params
413
417
last_n_tokens_size = settings .last_n_tokens_size ,
414
418
# LoRA Params
You can’t perform that action at this time.
0 commit comments