diff --git a/Makefile b/Makefile
index 841a0f58f..4c273737c 100644
--- a/Makefile
+++ b/Makefile
@@ -92,9 +92,10 @@ deploy.pyinstaller.mac:
 	# This still builds with metal support (I think b/c GGML_NATIVE=ON). Not an
 	# issue since can still run Q4_0 models w/ repacking support on CPU if `-ngl 0`.
+	CMAKE_BUILD_TYPE="Release" \
 	CMAKE_ARGS="-DGGML_METAL=OFF -DGGML_LLAMAFILE=OFF -DGGML_BLAS=OFF \
-	-DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON \
-	-DCMAKE_BUILD_TYPE=Release" python3 -m pip install -v -e .[server,dev]
+	-DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON" \
+	python3 -m pip install -v -e .[server,pyinstaller]
 	@server_path=$$(python -c 'import llama_cpp.server; print(llama_cpp.server.__file__)' | sed s/init/main/) ; \
 	echo "Server path: $$server_path" ; \
 	base_path=$$(python -c 'from llama_cpp._ggml import libggml_base_path; print(str(libggml_base_path))') ; \
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 96f0717a5..fcc3d75cf 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -546,6 +546,12 @@ def free_lora_adapter():
 
         self._sampler = None
 
+        # Formatted prompt path, used for storing formatted prompts as NDJSON
+        if (formatted_prompt_path := kwargs.get("formatted_prompt_path")) is not None:
+            self.formatted_prompt_path = formatted_prompt_path
+        else:
+            self.formatted_prompt_path = None
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         return self._ctx.ctx
@@ -2178,6 +2184,8 @@ def __getstate__(self):
             # Misc
             spm_infill=self.spm_infill,
             verbose=self.verbose,
+            # Path provided for prompt serialization, if any
+            formatted_prompt_path=self.formatted_prompt_path,
         )
 
     def __setstate__(self, state):
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 5b6e9ac3f..575a85f2b 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -6,6 +6,7 @@
 import ctypes
 import dataclasses
 import random
+import pathlib
 import string
 
 from contextlib import ExitStack
@@ -24,6 +25,7 @@
 import jinja2
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 
+import filelock
 import numpy as np
 import numpy.typing as npt
@@ -279,11 +281,15 @@ def _convert_text_completion_logprobs_to_chat(
                     }
                     for top_token, top_logprob in top_logprobs.items()
                 ],
-            } for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"])
+            }
+            for (token, logprob, top_logprobs) in zip(
+                logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]
+            )
         ],
         "refusal": None,
     }
 
+
 def _convert_text_completion_to_chat(
     completion: llama_types.Completion,
 ) -> llama_types.ChatCompletion:
@@ -300,7 +306,9 @@ def _convert_text_completion_to_chat(
                     "role": "assistant",
                     "content": completion["choices"][0]["text"],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": completion["choices"][0]["finish_reason"],
             }
         ],
@@ -344,7 +352,9 @@ def _convert_text_completion_chunks_to_chat(
                     if chunk["choices"][0]["finish_reason"] is None
                     else {}
                 ),
-                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    chunk["choices"][0]["logprobs"]
+                ),
                 "finish_reason": chunk["choices"][0]["finish_reason"],
             }
         ],
@@ -407,7 +417,9 @@ def _convert_completion_to_chat_function(
                         }
                     ],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": "tool_calls",
             }
         ],
@@ -460,7 +472,9 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "delta": {
                         "role": None,
                         "content": None,
@@ -497,7 +511,9 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "delta": {
                         "role": None,
                         "content": None,
@@ -598,6 +614,19 @@ def chat_completion_handler(
             add_bos=not result.added_special,
             special=True,
         )
+
+        # Is there a way to ensure this is not set in production? It will
+        # add at least a little latency, since appending to the file is extra I/O.
+        if llama.formatted_prompt_path is not None:
+            output_path = pathlib.Path(llama.formatted_prompt_path)
+
+            # We ensure that the output path ends with .ndjson in pydantic validation.
+            lockfile_path = output_path.with_suffix(".lock")
+            with filelock.FileLock(str(lockfile_path)):
+                with output_path.open("a", encoding="utf-8") as f:
+                    json.dump({"prompt": result.prompt, "prompt_tokens": prompt}, f)
+                    f.write("\n")
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -695,7 +724,7 @@ def chat_completion_handler(
 
 
 def hf_autotokenizer_to_chat_formatter(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> ChatFormatter:
     # https://huggingface.co/docs/transformers/main/chat_templating
     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
@@ -720,7 +749,7 @@ def format_autotokenizer(
 
 
 def hf_autotokenizer_to_chat_completion_handler(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> LlamaChatCompletionHandler:
     chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path)
     return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -1790,7 +1819,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
                         }
                     ],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": "tool_calls",
             }
         ],
@@ -2202,7 +2233,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": None,
                             "content": None,
@@ -2304,7 +2337,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": "assistant",
                             "content": None,
@@ -2342,7 +2377,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": "assistant",
                             "content": buffer.pop(0),
@@ -2365,7 +2402,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": "assistant",
                             "content": (
@@ -2451,7 +2490,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": None,
                             "content": None,
@@ -2685,7 +2726,9 @@ def generate_streaming(tools, functions, function_call, prompt):
             choices=[
                 {
                     "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                     "message": {
                         "role": "assistant",
                         "content": None if content == "" else content,
@@ -2795,9 +2838,7 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
         embed = self._llava_cpp.llava_image_embed_make_with_bytes(
             self.clip_ctx,
             n_threads_batch,
-            (ctypes.c_uint8 * len(image_bytes)).from_buffer(
-                bytearray(image_bytes)
-            ),
+            (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
             len(image_bytes),
         )
         self._last_image_embed = embed
@@ -2869,7 +2910,6 @@ def __call__(
         if self.verbose:
             print(text, file=sys.stderr)
 
-        # Evaluate prompt
         llama.reset()
         llama._ctx.kv_cache_clear()
@@ -2885,7 +2925,9 @@ def __call__(
                 llama.eval(tokens)
             else:
                 image_bytes = self.load_image(value)
-                embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
+                embed = self._embed_image_bytes(
+                    image_bytes, llama.context_params.n_threads_batch
+                )
                 if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
                     raise ValueError(
                         f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
@@ -3404,7 +3446,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
         "{% endif %}"
         "{% endif %}"
         "{% endfor %}"
-        "{% for content in message['content'] %}"
         "{% if content.type == 'text' %}"
         "{{ content.text }}"
@@ -3817,7 +3858,9 @@ def chatml_function_calling(
                 {
                     "finish_reason": "tool_calls",
                     "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                     "message": {
                         "role": "assistant",
                         "content": None,
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 905e2fd76..721a3832a 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -223,6 +223,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
     import functools
 
     kwargs = {}
+    # Move this here so that it works w/ llama_cpp.Llama.from_pretrained as
+    # well as the 'normal' constructor.
+ kwargs["formatted_prompt_path"] = settings.formatted_prompt_path if settings.hf_model_repo_id is not None: create_fn = functools.partial( diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index c79b45d28..54cace0bc 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -188,6 +188,14 @@ class ModelSettings(BaseSettings): default=None, description="Type of the value cache quantization.", ) + + # Path to store formatted prompts as NDJSON + formatted_prompt_path: Optional[str] = Field( + default=None, + pattern=r".*\.ndjson$", + description="Output path to store formatted prompts as NDJSON.", + ) + # Misc verbose: bool = Field( default=True, description="Whether to print debug information." diff --git a/pyproject.toml b/pyproject.toml index ee86a6c8e..7601d51c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,11 @@ dependencies = [ "diskcache>=5.6.1", "jinja2>=2.11.3", "PyTrie>=0.4.0", + "filelock>=3.18.0", ] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -56,9 +56,11 @@ dev = [ "httpx>=0.24.1", "pandas>=2.2.1", "tqdm>=4.66.2", +] +pyinstaller = [ "pyinstaller>=6.11.1", ] -all = ["llama_cpp_python[server,test,dev]"] +all = ["llama_cpp_python[server,test,dev,pyinstaller]"] [tool.scikit-build] wheel.packages = ["llama_cpp"] diff --git a/tests/test_settings.py b/tests/test_settings.py new file mode 100644 index 000000000..bfe46209e --- /dev/null +++ b/tests/test_settings.py @@ -0,0 +1,24 @@ +import pytest + +from llama_cpp.server.settings import ModelSettings +from pydantic import ValidationError + +# Required to pass in model name +DUMMY_MODEL_NAME = "foo" + + +def test_formatted_prompt_path_default_none(): + m = ModelSettings(model=DUMMY_MODEL_NAME) + assert m.formatted_prompt_path is None + + +def test_validation_error_if_prompt_path_not_endswith_ndjson(): + with pytest.raises( + ValidationError, match=r"String should match pattern '.*\\.ndjson\$'" + ): + ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="invalid_path.txt") + + +def test_formatted_prompt_path_works_if_endswith_ndjson(): + m = ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="valid_path.ndjson") + assert m.formatted_prompt_path == "valid_path.ndjson"