2 files changed: +13 −2 lines changed
@@ -286,7 +286,16 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest

 The high-level API also provides a simple interface for chat completion.

-Note that `chat_format` option must be set for the particular model you are using.
+Chat completion requires that the model know how to format the messages into a single prompt.
+The `Llama` class does this using pre-registered chat formats (e.g. `chatml`, `llama-2`, `gemma`, etc.) or by providing a custom chat handler object.
+
+The model will format the messages into a single prompt using the following order of precedence:
+- Use the `chat_handler` if provided
+- Use the `chat_format` if provided
+- Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this)
+- else, fall back to the `llama-2` chat format
+
+Set `verbose=True` to see the selected chat format.

 ```python
 >>> from llama_cpp import Llama
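As a rough sketch of how the precedence described in this README change plays out from user code (the model path below is a placeholder; `model_path`, `chat_format`, `verbose`, and `create_chat_completion` are the standard `Llama` parameters and method the section refers to):

```python
from llama_cpp import Llama

# Explicit chat_format: the gguf metadata template and the llama-2 fallback are skipped.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    chat_format="chatml",              # force a pre-registered format
    verbose=True,                      # prints which chat format/template was selected
)

# No chat_format or chat_handler: the tokenizer.chat_template from the gguf
# metadata is used when present, otherwise the llama-2 fallback.
llm_auto = Llama(model_path="./models/model.gguf", verbose=True)

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Name the planets in the solar system."},
    ]
)
print(response["choices"][0]["message"]["content"])
```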
@@ -410,7 +410,7 @@ def __init__(
             bos_token = self._model.token_get_text(bos_token_id)

             if self.verbose:
-                print(f"Using chat template: {template}", file=sys.stderr)
+                print(f"Using gguf chat template: {template}", file=sys.stderr)
                 print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
                 print(f"Using chat bos_token: {bos_token}", file=sys.stderr)

@@ -420,6 +420,8 @@ def __init__(

         if self.chat_format is None and self.chat_handler is None:
             self.chat_format = "llama-2"
+            if self.verbose:
+                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)

     @property
     def ctx(self) -> llama_cpp.llama_context_p:
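For readers skimming the diff, the selection order implemented in `__init__` can be condensed into a standalone sketch; the function name, arguments, and return values below are illustrative only, but the order of the checks is what the changes above implement:

```python
from typing import Callable, Dict, Optional


def resolve_chat_format(
    chat_handler: Optional[Callable],
    chat_format: Optional[str],
    metadata: Dict[str, str],
) -> str:
    """Illustrative sketch of the precedence used by Llama.__init__."""
    if chat_handler is not None:
        return "custom chat handler"            # 1. explicit handler wins
    if chat_format is not None:
        return chat_format                      # 2. explicit pre-registered format
    if "tokenizer.chat_template" in metadata:
        return "gguf tokenizer.chat_template"   # 3. template embedded in the model file
    return "llama-2"                            # 4. fallback
```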