Commit 7828382: Add server support

1 parent fcb8051
File tree

2 files changed: +12 additions, -0 deletions

llama_cpp/server/model.py (3 additions, 0 deletions)

@@ -175,6 +175,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler=chat_handler,
         # Speculative Decoding
         draft_model=draft_model,
+        # KV Cache Quantization
+        type_k=settings.type_k,
+        type_v=settings.type_v,
         # Tokenizer
         tokenizer=tokenizer,
         # Misc
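
These two keyword arguments are forwarded to the llama_cpp.Llama constructor, which already accepts type_k and type_v for KV cache quantization. A minimal sketch (not part of this commit) of the equivalent direct call; the model path is hypothetical, and it assumes llama_cpp exports the GGML_TYPE_* constants (Q8_0 is value 8 in ggml's type enum):

import llama_cpp

# Quantize both the key and value halves of the KV cache to Q8_0.
llm = llama_cpp.Llama(
    model_path="./models/model.gguf",    # hypothetical path
    type_k=llama_cpp.GGML_TYPE_Q8_0,     # key cache quantization type
    type_v=llama_cpp.GGML_TYPE_Q8_0,     # value cache quantization type
)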

llama_cpp/server/settings.py (9 additions, 0 deletions)

@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
         default=10,
         description="Number of tokens to predict using the draft model.",
     )
+    # KV Cache Quantization
+    type_k: Optional[int] = Field(
+        default=None,
+        description="Type of the key cache quantization.",
+    )
+    type_v: Optional[int] = Field(
+        default=None,
+        description="Type of the value cache quantization.",
+    )
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
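
Since ModelSettings is a pydantic BaseSettings model, the new fields can also be set programmatically and forwarded through load_llama_from_model_settings from the diff above. A hedged usage sketch; the model path is hypothetical, and 8 is ggml's Q8_0 type code:

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

settings = ModelSettings(
    model="./models/model.gguf",  # hypothetical model path
    type_k=8,  # key cache quantized to Q8_0
    type_v=8,  # value cache quantized to Q8_0
)
llm = load_llama_from_model_settings(settings)

Because BaseSettings also reads environment variables, the same configuration should be reachable via TYPE_K=8 TYPE_V=8 when launching the server, though that wiring is an assumption rather than something shown in this diff.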

0 commit comments