CLI Arg to Dump Formatted Prompt Into NDJSON #5

Merged
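
In brief: this PR threads a new formatted_prompt_path option from the server settings (and plain constructor kwargs) into llama_cpp.Llama, and the chat completion handler appends every rendered prompt, plus its token ids, as one NDJSON line to that path. A minimal usage sketch, not taken from the PR itself (the model path and message are placeholders):

from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",        # placeholder GGUF path
    formatted_prompt_path="prompts.ndjson",  # new kwarg added by this PR
)
llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
)
# Each chat completion should now append one JSON line with the formatted
# prompt and its token ids to prompts.ndjson.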
Makefile — 5 changes: 3 additions & 2 deletions
@@ -92,9 +92,10 @@ deploy.pyinstaller.mac:

# This still builds with metal support (I think b/c GGML_NATIVE=ON). Not an
# issue since can still run Q4_0 models w/ repacking support on CPU if `-ngl 0`.
CMAKE_BUILD_TYPE="Release" \
CMAKE_ARGS="-DGGML_METAL=OFF -DGGML_LLAMAFILE=OFF -DGGML_BLAS=OFF \
-DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON \
-DCMAKE_BUILD_TYPE=Release" python3 -m pip install -v -e .[server,dev]
-DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON" \
python3 -m pip install -v -e .[server,pyinstaller]
@server_path=$$(python -c 'import llama_cpp.server; print(llama_cpp.server.__file__)' | sed s/init/main/) ; \
echo "Server path: $$server_path" ; \
base_path=$$(python -c 'from llama_cpp._ggml import libggml_base_path; print(str(libggml_base_path))') ; \
llama_cpp/llama.py — 8 changes: 8 additions & 0 deletions
@@ -546,6 +546,12 @@ def free_lora_adapter():

self._sampler = None

# Formatted prompt path, used for storing formatted prompts as NDJSON
if (formatted_prompt_path := kwargs.get("formatted_prompt_path")) is not None:
self.formatted_prompt_path = formatted_prompt_path
else:
self.formatted_prompt_path = None

@property
def ctx(self) -> llama_cpp.llama_context_p:
return self._ctx.ctx
@@ -2178,6 +2184,8 @@ def __getstate__(self):
# Misc
spm_infill=self.spm_infill,
verbose=self.verbose,
# Path provided for prompt serialization, if any
formatted_prompt_path=self.formatted_prompt_path,
)

def __setstate__(self, state):
llama_cpp/llama_chat_format.py — 87 changes: 65 additions & 22 deletions
@@ -6,6 +6,7 @@
import ctypes
import dataclasses
import random
import pathlib
import string

from contextlib import ExitStack
@@ -24,6 +25,7 @@

import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment
import filelock

import numpy as np
import numpy.typing as npt
@@ -279,11 +281,15 @@ def _convert_text_completion_logprobs_to_chat(
}
for top_token, top_logprob in top_logprobs.items()
],
} for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"])
}
for (token, logprob, top_logprobs) in zip(
logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]
)
],
"refusal": None,
}


def _convert_text_completion_to_chat(
completion: llama_types.Completion,
) -> llama_types.ChatCompletion:
@@ -300,7 +306,9 @@ def _convert_text_completion_to_chat(
"role": "assistant",
"content": completion["choices"][0]["text"],
},
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"finish_reason": completion["choices"][0]["finish_reason"],
}
],
@@ -344,7 +352,9 @@ def _convert_text_completion_chunks_to_chat(
if chunk["choices"][0]["finish_reason"] is None
else {}
),
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"finish_reason": chunk["choices"][0]["finish_reason"],
}
],
@@ -407,7 +417,9 @@ def _convert_completion_to_chat_function(
}
],
},
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"finish_reason": "tool_calls",
}
],
@@ -460,7 +472,9 @@ def _stream_response_to_function_stream(
{
"index": 0,
"finish_reason": None,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -497,7 +511,9 @@ def _stream_response_to_function_stream(
{
"index": 0,
"finish_reason": None,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -598,6 +614,19 @@ def chat_completion_handler(
add_bos=not result.added_special,
special=True,
)

# Is there a way to ensure this is not set in production? This will
# slow things down at least a little (latency), because I/O is slow.
if llama.formatted_prompt_path is not None:
output_path = pathlib.Path(llama.formatted_prompt_path)

# We ensure that output path ends with .ndjson in pydantic validation.
lockfile_path = output_path.with_suffix(".lock")
with filelock.FileLock(str(lockfile_path)):
with output_path.open("a", encoding="utf-8") as f:
json.dump({"prompt": result.prompt, "prompt_tokens": prompt}, f)
f.write("\n")

if result.stop is not None:
stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -695,7 +724,7 @@ def chat_completion_handler(


def hf_autotokenizer_to_chat_formatter(
pretrained_model_name_or_path: Union[str, os.PathLike[str]]
pretrained_model_name_or_path: Union[str, os.PathLike[str]],
) -> ChatFormatter:
# https://huggingface.co/docs/transformers/main/chat_templating
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
@@ -720,7 +749,7 @@ def format_autotokenizer(


def hf_autotokenizer_to_chat_completion_handler(
pretrained_model_name_or_path: Union[str, os.PathLike[str]]
pretrained_model_name_or_path: Union[str, os.PathLike[str]],
) -> LlamaChatCompletionHandler:
chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path)
return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -1790,7 +1819,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
}
],
},
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"finish_reason": "tool_calls",
}
],
@@ -2202,7 +2233,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -2304,7 +2337,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": "assistant",
"content": None,
@@ -2342,7 +2377,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": "assistant",
"content": buffer.pop(0),
@@ -2365,7 +2402,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": "assistant",
"content": (
@@ -2451,7 +2490,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -2685,7 +2726,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"message": {
"role": "assistant",
"content": None if content == "" else content,
@@ -2795,9 +2838,7 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
embed = self._llava_cpp.llava_image_embed_make_with_bytes(
self.clip_ctx,
n_threads_batch,
(ctypes.c_uint8 * len(image_bytes)).from_buffer(
bytearray(image_bytes)
),
(ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
len(image_bytes),
)
self._last_image_embed = embed
@@ -2869,7 +2910,6 @@ def __call__(
if self.verbose:
print(text, file=sys.stderr)


# Evaluate prompt
llama.reset()
llama._ctx.kv_cache_clear()
@@ -2885,7 +2925,9 @@ def __call__(
llama.eval(tokens)
else:
image_bytes = self.load_image(value)
embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
embed = self._embed_image_bytes(
image_bytes, llama.context_params.n_threads_batch
)
if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
raise ValueError(
f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
@@ -3404,7 +3446,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
"{% endif %}"
"{% endif %}"
"{% endfor %}"

"{% for content in message['content'] %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
@@ -3817,7 +3858,9 @@ def chatml_function_calling(
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"message": {
"role": "assistant",
"content": None,
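The block added to chat_completion_handler above writes one JSON object per request ({"prompt": ..., "prompt_tokens": [...]}) and serializes writers with filelock.FileLock so concurrent processes don't interleave lines. A hedged sketch for reading such a dump back, assuming the placeholder file name prompts.ndjson:

import json
import pathlib

for line in pathlib.Path("prompts.ndjson").read_text(encoding="utf-8").splitlines():
    if not line.strip():
        continue
    record = json.loads(line)
    # "prompt" is the rendered chat template; "prompt_tokens" is the token id list.
    print(len(record["prompt_tokens"]), record["prompt"][:80])
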
llama_cpp/server/model.py — 3 changes: 3 additions & 0 deletions
@@ -223,6 +223,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
import functools

kwargs = {}
# Set this here so that it works w/ llama_cpp.Llama.from_pretrained as
# well as the 'normal' constructor.
kwargs["formatted_prompt_path"] = settings.formatted_prompt_path

if settings.hf_model_repo_id is not None:
create_fn = functools.partial(
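For the server path shown above, the field is read from ModelSettings and handed to the Llama constructor via kwargs. A sketch under stated assumptions (placeholder model path; on the command line the field presumably surfaces as --formatted_prompt_path, following the server's usual mapping of ModelSettings fields to CLI arguments):

from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/model.gguf",             # placeholder model path
    formatted_prompt_path="prompts.ndjson",  # must match the .ndjson pattern
)
llama = load_llama_from_model_settings(settings)  # kwarg is forwarded to Llama
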
llama_cpp/server/settings.py — 8 changes: 8 additions & 0 deletions
@@ -188,6 +188,14 @@ class ModelSettings(BaseSettings):
default=None,
description="Type of the value cache quantization.",
)

# Path to store formatted prompts as NDJSON
formatted_prompt_path: Optional[str] = Field(
default=None,
pattern=r".*\.ndjson$",
description="Output path to store formatted prompts as NDJSON.",
)

# Misc
verbose: bool = Field(
default=True, description="Whether to print debug information."
pyproject.toml — 8 changes: 5 additions & 3 deletions
@@ -15,11 +15,11 @@ dependencies = [
"diskcache>=5.6.1",
"jinja2>=2.11.3",
"PyTrie>=0.4.0",
"filelock>=3.18.0",
]
requires-python = ">=3.8"
requires-python = ">=3.9"
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@@ -56,9 +56,11 @@ dev = [
"httpx>=0.24.1",
"pandas>=2.2.1",
"tqdm>=4.66.2",
]
pyinstaller = [
"pyinstaller>=6.11.1",
]
all = ["llama_cpp_python[server,test,dev]"]
all = ["llama_cpp_python[server,test,dev,pyinstaller]"]

[tool.scikit-build]
wheel.packages = ["llama_cpp"]
tests/test_settings.py — 24 changes: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import pytest

from llama_cpp.server.settings import ModelSettings
from pydantic import ValidationError

# Required to pass in model name
DUMMY_MODEL_NAME = "foo"


def test_formatted_prompt_path_default_none():
m = ModelSettings(model=DUMMY_MODEL_NAME)
assert m.formatted_prompt_path is None


def test_validation_error_if_prompt_path_not_endswith_ndjson():
with pytest.raises(
ValidationError, match=r"String should match pattern '.*\\.ndjson\$'"
):
ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="invalid_path.txt")


def test_formatted_prompt_path_works_if_endswith_ndjson():
m = ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="valid_path.ndjson")
assert m.formatted_prompt_path == "valid_path.ndjson"