Commit f4fe27f

Merge pull request #5 from tc-wolf/cli_arg_kv_cache_dump_prompt_ndjson
CLI Arg to Dump Formatted Prompt Into NDJSON
2 parents 9b631db + 8d76fc1 commit f4fe27f

File tree: 7 files changed, 116 additions and 27 deletions (+116 −27)

Makefile
3 additions, 2 deletions (+3 −2)
@@ -92,9 +92,10 @@ deploy.pyinstaller.mac:
 
	# This still builds with metal support (I think b/c GGML_NATIVE=ON). Not an
	# issue since can still run Q4_0 models w/ repacking support on CPU if `-ngl 0`.
+	CMAKE_BUILD_TYPE="Release" \
	CMAKE_ARGS="-DGGML_METAL=OFF -DGGML_LLAMAFILE=OFF -DGGML_BLAS=OFF \
-	            -DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON \
-	            -DCMAKE_BUILD_TYPE=Release" python3 -m pip install -v -e .[server,dev]
+	            -DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON" \
+	            python3 -m pip install -v -e .[server,pyinstaller]
	@server_path=$$(python -c 'import llama_cpp.server; print(llama_cpp.server.__file__)' | sed s/init/main/) ; \
	echo "Server path: $$server_path" ; \
	base_path=$$(python -c 'from llama_cpp._ggml import libggml_base_path; print(str(libggml_base_path))') ; \

llama_cpp/llama.py
8 additions, 0 deletions (+8 −0)
@@ -546,6 +546,12 @@ def free_lora_adapter():
 
         self._sampler = None
 
+        # Created formatted prompt path, used for storing formatted prompts as NDJSON
+        if (formatted_prompt_path := kwargs.get("formatted_prompt_path")) is not None:
+            self.formatted_prompt_path = formatted_prompt_path
+        else:
+            self.formatted_prompt_path = None
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         return self._ctx.ctx
@@ -2178,6 +2184,8 @@ def __getstate__(self):
             # Misc
             spm_infill=self.spm_infill,
             verbose=self.verbose,
+            # Path provided for prompt serialization, if any
+            formatted_prompt_path=self.formatted_prompt_path,
         )
 
     def __setstate__(self, state):
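
Note: the new path is read from **kwargs, so it can be passed straight to the Llama constructor, and __getstate__ now carries it across pickling. A minimal sketch, assuming a local GGUF file (the model path below is hypothetical, not part of this commit):

from llama_cpp import Llama

llm = Llama(
    model_path="./models/example-q4_0.gguf",   # hypothetical local model file
    formatted_prompt_path="./prompts.ndjson",  # new keyword, picked up via kwargs above
)
# A pickled or copied instance keeps the same formatted_prompt_path, so it
# continues appending to the same NDJSON file.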

llama_cpp/llama_chat_format.py
65 additions, 22 deletions (+65 −22)
@@ -6,6 +6,7 @@
 import ctypes
 import dataclasses
 import random
+import pathlib
 import string
 
 from contextlib import ExitStack
@@ -24,6 +25,7 @@
 
 import jinja2
 from jinja2.sandbox import ImmutableSandboxedEnvironment
+import filelock
 
 import numpy as np
 import numpy.typing as npt
@@ -279,11 +281,15 @@ def _convert_text_completion_logprobs_to_chat(
                     }
                     for top_token, top_logprob in top_logprobs.items()
                 ],
-            } for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"])
+            }
+            for (token, logprob, top_logprobs) in zip(
+                logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]
+            )
         ],
         "refusal": None,
     }
 
+
 def _convert_text_completion_to_chat(
     completion: llama_types.Completion,
 ) -> llama_types.ChatCompletion:
@@ -300,7 +306,9 @@ def _convert_text_completion_to_chat(
                     "role": "assistant",
                     "content": completion["choices"][0]["text"],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": completion["choices"][0]["finish_reason"],
             }
         ],
@@ -344,7 +352,9 @@ def _convert_text_completion_chunks_to_chat(
                     if chunk["choices"][0]["finish_reason"] is None
                     else {}
                 ),
-                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    chunk["choices"][0]["logprobs"]
+                ),
                 "finish_reason": chunk["choices"][0]["finish_reason"],
             }
         ],
@@ -407,7 +417,9 @@ def _convert_completion_to_chat_function(
                     }
                 ],
             },
-            "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+            "logprobs": _convert_text_completion_logprobs_to_chat(
+                completion["choices"][0]["logprobs"]
+            ),
             "finish_reason": "tool_calls",
         }
     ],
@@ -460,7 +472,9 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "delta": {
                         "role": None,
                         "content": None,
@@ -497,7 +511,9 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "delta": {
                         "role": None,
                         "content": None,
@@ -598,6 +614,19 @@ def chat_completion_handler(
             add_bos=not result.added_special,
             special=True,
         )
+
+        # Is there a way to ensure this is not set for production? This will
+        # slow down things at least a little (latency) because I/O is slow.
+        if llama.formatted_prompt_path is not None:
+            output_path = pathlib.Path(llama.formatted_prompt_path)
+
+            # We ensure that output path ends with .ndjson in pydantic validation.
+            lockfile_path = output_path.with_suffix(".lock")
+            with filelock.FileLock(str(lockfile_path)):
+                with output_path.open("a", encoding="utf-8") as f:
+                    json.dump({"prompt": result.prompt, "prompt_tokens": prompt}, f)
+                    f.write("\n")
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
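
Each chat completion appends one JSON object per line (keys "prompt" and "prompt_tokens"), guarded by a .lock file so concurrent workers do not interleave writes. A minimal sketch for reading the dump back, assuming the server was configured with formatted_prompt_path="prompts.ndjson":

import json

with open("prompts.ndjson", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        print(record["prompt"])               # formatted prompt string handed to the model
        print(len(record["prompt_tokens"]))   # number of token ids after tokenization
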
@@ -695,7 +724,7 @@ def chat_completion_handler(
 
 
 def hf_autotokenizer_to_chat_formatter(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> ChatFormatter:
     # https://huggingface.co/docs/transformers/main/chat_templating
     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
@@ -720,7 +749,7 @@ def format_autotokenizer(
 
 
 def hf_autotokenizer_to_chat_completion_handler(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> LlamaChatCompletionHandler:
     chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path)
     return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -1790,7 +1819,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
                     }
                 ],
             },
-            "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+            "logprobs": _convert_text_completion_logprobs_to_chat(
+                completion["choices"][0]["logprobs"]
+            ),
             "finish_reason": "tool_calls",
         }
     ],
@@ -2202,7 +2233,9 @@ def generate_streaming(tools, functions, function_call, prompt):
            choices=[
                {
                    "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                    "delta": {
                        "role": None,
                        "content": None,
@@ -2304,7 +2337,9 @@ def generate_streaming(tools, functions, function_call, prompt):
            choices=[
                {
                    "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                    "delta": {
                        "role": "assistant",
                        "content": None,
@@ -2342,7 +2377,9 @@ def generate_streaming(tools, functions, function_call, prompt):
            choices=[
                {
                    "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                    "delta": {
                        "role": "assistant",
                        "content": buffer.pop(0),
@@ -2365,7 +2402,9 @@ def generate_streaming(tools, functions, function_call, prompt):
            choices=[
                {
                    "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                    "delta": {
                        "role": "assistant",
                        "content": (
@@ -2451,7 +2490,9 @@ def generate_streaming(tools, functions, function_call, prompt):
            choices=[
                {
                    "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                    "delta": {
                        "role": None,
                        "content": None,
@@ -2685,7 +2726,9 @@ def generate_streaming(tools, functions, function_call, prompt):
            choices=[
                {
                    "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                    "message": {
                        "role": "assistant",
                        "content": None if content == "" else content,
@@ -2795,9 +2838,7 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
         embed = self._llava_cpp.llava_image_embed_make_with_bytes(
             self.clip_ctx,
             n_threads_batch,
-            (ctypes.c_uint8 * len(image_bytes)).from_buffer(
-                bytearray(image_bytes)
-            ),
+            (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
             len(image_bytes),
         )
         self._last_image_embed = embed
@@ -2869,7 +2910,6 @@ def __call__(
         if self.verbose:
             print(text, file=sys.stderr)
 
-
         # Evaluate prompt
         llama.reset()
         llama._ctx.kv_cache_clear()
@@ -2885,7 +2925,9 @@ def __call__(
                 llama.eval(tokens)
             else:
                 image_bytes = self.load_image(value)
-                embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
+                embed = self._embed_image_bytes(
+                    image_bytes, llama.context_params.n_threads_batch
+                )
                 if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
                     raise ValueError(
                         f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
@@ -3404,7 +3446,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
         "{% endif %}"
         "{% endif %}"
         "{% endfor %}"
-
         "{% for content in message['content'] %}"
         "{% if content.type == 'text' %}"
         "{{ content.text }}"
@@ -3817,7 +3858,9 @@ def chatml_function_calling(
            {
                "finish_reason": "tool_calls",
                "index": 0,
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                "message": {
                    "role": "assistant",
                    "content": None,

llama_cpp/server/model.py
3 additions, 0 deletions (+3 −0)
@@ -223,6 +223,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
     import functools
 
     kwargs = {}
+    # Move this here so that works w/ llama_cpp.Llama.from_pretrained as
+    # well as 'normal' constructor.
+    kwargs["formatted_prompt_path"] = settings.formatted_prompt_path
 
     if settings.hf_model_repo_id is not None:
         create_fn = functools.partial(
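
Because the kwarg is added to kwargs before either construction path runs, it reaches both the plain constructor and the from_pretrained path. A sketch of the latter, assuming Llama.from_pretrained forwards extra keyword arguments to the constructor (repo id and filename below are placeholders):

from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="example-org/example-model-GGUF",  # placeholder Hugging Face repo
    filename="*q4_0.gguf",                     # placeholder filename pattern
    formatted_prompt_path="prompts.ndjson",    # forwarded through **kwargs
)
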

llama_cpp/server/settings.py
8 additions, 0 deletions (+8 −0)
@@ -188,6 +188,14 @@ class ModelSettings(BaseSettings):
         default=None,
         description="Type of the value cache quantization.",
     )
+
+    # Path to store formatted prompts as NDJSON
+    formatted_prompt_path: Optional[str] = Field(
+        default=None,
+        pattern=r".*\.ndjson$",
+        description="Output path to store formatted prompts as NDJSON.",
+    )
+
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
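
The pattern constraint means pydantic rejects any path that does not end in ".ndjson" at settings-load time, which tests/test_settings.py (below) exercises. A quick sketch of the expected behaviour, using a dummy model name:

from pydantic import ValidationError
from llama_cpp.server.settings import ModelSettings

ModelSettings(model="foo", formatted_prompt_path="prompts.ndjson")   # accepted
try:
    ModelSettings(model="foo", formatted_prompt_path="prompts.txt")  # rejected by the pattern
except ValidationError as exc:
    print(exc)
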

pyproject.toml
5 additions, 3 deletions (+5 −3)
@@ -15,11 +15,11 @@ dependencies = [
     "diskcache>=5.6.1",
     "jinja2>=2.11.3",
     "PyTrie>=0.4.0",
+    "filelock>=3.18.0",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 classifiers = [
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -56,9 +56,11 @@ dev = [
     "httpx>=0.24.1",
     "pandas>=2.2.1",
     "tqdm>=4.66.2",
+]
+pyinstaller = [
     "pyinstaller>=6.11.1",
 ]
-all = ["llama_cpp_python[server,test,dev]"]
+all = ["llama_cpp_python[server,test,dev,pyinstaller]"]
 
 [tool.scikit-build]
 wheel.packages = ["llama_cpp"]

tests/test_settings.py
24 additions, 0 deletions (+24 −0)
@@ -0,0 +1,24 @@
+import pytest
+
+from llama_cpp.server.settings import ModelSettings
+from pydantic import ValidationError
+
+# Required to pass in model name
+DUMMY_MODEL_NAME = "foo"
+
+
+def test_formatted_prompt_path_default_none():
+    m = ModelSettings(model=DUMMY_MODEL_NAME)
+    assert m.formatted_prompt_path is None
+
+
+def test_validation_error_if_prompt_path_not_endswith_ndjson():
+    with pytest.raises(
+        ValidationError, match=r"String should match pattern '.*\\.ndjson\$'"
+    ):
+        ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="invalid_path.txt")
+
+
+def test_formatted_prompt_path_works_if_endswith_ndjson():
+    m = ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="valid_path.ndjson")
+    assert m.formatted_prompt_path == "valid_path.ndjson"

0 commit comments
