diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 11fe169cf5..210fb040b8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -107,6 +107,7 @@ def __init__( # Chat Format Params chat_format: Optional[str] = None, chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, + chat_template_kwargs: Optional[Dict[str, Any]] = None, # Speculative Decoding draft_model: Optional[LlamaDraftModel] = None, # Tokenizer Override @@ -185,6 +186,7 @@ def __init__( numa: numa policy chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. + chat_template_kwargs: Optional dict of default keyword arguments, given at model load time, that are forwarded to the chat handler on each create_chat_completion call (e.g., enable_thinking=False for Qwen3.5 models); per-call keyword arguments override them. draft_model: Optional draft model to use for speculative decoding. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. verbose: Print verbose output to stderr. 
@@ -454,6 +456,11 @@ def free_lora_adapter(): str, llama_chat_format.LlamaChatCompletionHandler ] = {} + # Do not wrap chat_handler with chat_template_kwargs here; instead + # store chat_template_kwargs on self so it can be resolved at call time + # (after _chat_handlers is populated from model metadata) + self._chat_template_kwargs = chat_template_kwargs or {} + self.draft_model = draft_model self._n_vocab = self.n_vocab() @@ -1973,6 +1980,7 @@ def create_chat_completion( logit_bias: Optional[Dict[int, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + **kwargs: Any, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2014,6 +2022,8 @@ def create_chat_completion( or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(self.chat_format) ) + # chat_template_kwargs (from model load time) as defaults, + # but allow per-call kwargs to override them return handler( llama=self, messages=messages, @@ -2044,6 +2054,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + **{**self._chat_template_kwargs, **kwargs}, ) def create_chat_completion_openai_v1( @@ -2132,6 +2143,7 @@ def __getstate__(self): # Chat Format Params chat_format=self.chat_format, chat_handler=self.chat_handler, + chat_template_kwargs=self._chat_template_kwargs, # Speculative Decidng draft_model=self.draft_model, # KV cache quantization