samfundev
llama_cpp/llama.py: 8 additions & 1 deletion
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -811,9 +811,16 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) + max_tokens > self._n_ctx:
+        if len(prompt_tokens) > self._n_ctx:
             raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}")
 
+        # Truncate max_tokens if requested tokens would exceed the context window
+        max_tokens = (
+            max_tokens
+            if max_tokens + len(prompt_tokens) < self._n_ctx
+            else (self._n_ctx - len(prompt_tokens))
+        )
+
         if stop != []:
             stop_sequences = [s.encode("utf-8") for s in stop]
         else: