Commit 6c44a3f

feat: Add option to configure n_ubatch
1 parent 47d7a62 commit 6c44a3f

3 files changed: +9 -0 lines changed

llama_cpp/llama.py

5 additions, 0 deletions
@@ -75,6 +75,7 @@ def __init__(
         seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
         n_batch: int = 512,
+        n_ubatch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -156,6 +157,7 @@ def __init__(
             seed: RNG seed, -1 for random
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
+            n_ubatch: Physical batch size
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -309,6 +311,7 @@ def __init__(
         self.context_params = llama_cpp.llama_context_default_params()
         self.context_params.n_ctx = n_ctx
         self.context_params.n_batch = self.n_batch
+        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
         self.context_params.rope_scaling_type = (
@@ -380,6 +383,7 @@ def __init__(
             self.n_batch = min(n_ctx, n_batch)
             self.context_params.n_ctx = self._model.n_ctx_train()
             self.context_params.n_batch = self.n_batch
+            self.context_params.n_ubatch = min(self.n_batch, n_ubatch)

         self._ctx = self._stack.enter_context(
             contextlib.closing(
@@ -2071,6 +2075,7 @@ def __getstate__(self):
             seed=self.context_params.seed,
             n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
+            n_ubatch=self.context_params.n_ubatch,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,

llama_cpp/server/model.py

1 addition, 0 deletions
@@ -249,6 +249,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         seed=settings.seed,
         n_ctx=settings.n_ctx,
         n_batch=settings.n_batch,
+        n_ubatch=settings.n_ubatch,
         n_threads=settings.n_threads,
         n_threads_batch=settings.n_threads_batch,
         rope_scaling_type=settings.rope_scaling_type,

llama_cpp/server/settings.py

3 additions, 0 deletions
@@ -70,6 +70,9 @@ class ModelSettings(BaseSettings):
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
+    n_ubatch: int = Field(
+        default=512, ge=1, description="The physical batch size used by llama.cpp"
+    )
     n_threads: int = Field(
         default=max(multiprocessing.cpu_count() // 2, 1),
         ge=1,

0 commit comments
