CLI Arg to Dump Formatted Prompt Into NDJSON #5

Merged
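
In brief: this PR threads a new formatted_prompt_path option from the server settings (and plain constructor kwargs) into llama_cpp.Llama, and the chat completion handler appends every rendered prompt, plus its token ids, as one NDJSON line to that path. A minimal usage sketch, not taken from the PR itself (the model path and message are placeholders):

from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",        # placeholder GGUF path
    formatted_prompt_path="prompts.ndjson",  # new kwarg added by this PR
)
llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
)
# Each chat completion should now append one JSON line with the formatted
# prompt and its token ids to prompts.ndjson.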
Makefile — 5 changes: 3 additions & 2 deletions
@@ -92,9 +92,10 @@ deploy.pyinstaller.mac:

# This still builds with metal support (I think b/c GGML_NATIVE=ON). Not an
# issue since can still run Q4_0 models w/ repacking support on CPU if `-ngl 0`.
CMAKE_BUILD_TYPE="Release" \
CMAKE_ARGS="-DGGML_METAL=OFF -DGGML_LLAMAFILE=OFF -DGGML_BLAS=OFF \
-DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON \
-DCMAKE_BUILD_TYPE=Release" python3 -m pip install -v -e .[server,dev]
-DGGML_NATIVE=ON -DGGML_CPU_AARCH64=ON" \
python3 -m pip install -v -e .[server,pyinstaller]
@server_path=$$(python -c 'import llama_cpp.server; print(llama_cpp.server.__file__)' | sed s/init/main/) ; \
echo "Server path: $$server_path" ; \
base_path=$$(python -c 'from llama_cpp._ggml import libggml_base_path; print(str(libggml_base_path))') ; \
llama_cpp/llama.py — 8 changes: 8 additions & 0 deletions
@@ -546,6 +546,12 @@ def free_lora_adapter():

self._sampler = None

# Formatted prompt path, used for storing formatted prompts as NDJSON
if (formatted_prompt_path := kwargs.get("formatted_prompt_path")) is not None:
self.formatted_prompt_path = formatted_prompt_path
else:
self.formatted_prompt_path = None

@property
def ctx(self) -> llama_cpp.llama_context_p:
return self._ctx.ctx
@@ -2178,6 +2184,8 @@ def __getstate__(self):
# Misc
spm_infill=self.spm_infill,
verbose=self.verbose,
# Path provided for prompt serialization, if any
formatted_prompt_path=self.formatted_prompt_path,
)

def __setstate__(self, state):
llama_cpp/llama_chat_format.py — 87 changes: 65 additions & 22 deletions
@@ -6,6 +6,7 @@
import ctypes
import dataclasses
import random
import pathlib
import string

from contextlib import ExitStack
@@ -24,6 +25,7 @@

import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment
import filelock

import numpy as np
import numpy.typing as npt
@@ -279,11 +281,15 @@ def _convert_text_completion_logprobs_to_chat(
}
for top_token, top_logprob in top_logprobs.items()
],
} for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"])
}
for (token, logprob, top_logprobs) in zip(
logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]
)
],
"refusal": None,
}


def _convert_text_completion_to_chat(
completion: llama_types.Completion,
) -> llama_types.ChatCompletion:
@@ -300,7 +306,9 @@ def _convert_text_completion_to_chat(
"role": "assistant",
"content": completion["choices"][0]["text"],
},
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"finish_reason": completion["choices"][0]["finish_reason"],
}
],
@@ -344,7 +352,9 @@ def _convert_text_completion_chunks_to_chat(
if chunk["choices"][0]["finish_reason"] is None
else {}
),
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"finish_reason": chunk["choices"][0]["finish_reason"],
}
],
@@ -407,7 +417,9 @@ def _convert_completion_to_chat_function(
}
],
},
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"finish_reason": "tool_calls",
}
],
@@ -460,7 +472,9 @@ def _stream_response_to_function_stream(
{
"index": 0,
"finish_reason": None,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -497,7 +511,9 @@ def _stream_response_to_function_stream(
{
"index": 0,
"finish_reason": None,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -598,6 +614,19 @@ def chat_completion_handler(
add_bos=not result.added_special,
special=True,
)

# Is there a way to ensure this is not set in production? This will
# slow things down at least a little (latency), because I/O is slow.
if llama.formatted_prompt_path is not None:
output_path = pathlib.Path(llama.formatted_prompt_path)

# We ensure that output path ends with .ndjson in pydantic validation.
lockfile_path = output_path.with_suffix(".lock")
with filelock.FileLock(str(lockfile_path)):
with output_path.open("a", encoding="utf-8") as f:
json.dump({"prompt": result.prompt, "prompt_tokens": prompt}, f)
f.write("\n")

if result.stop is not None:
stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -695,7 +724,7 @@ def chat_completion_handler(


def hf_autotokenizer_to_chat_formatter(
pretrained_model_name_or_path: Union[str, os.PathLike[str]]
pretrained_model_name_or_path: Union[str, os.PathLike[str]],
) -> ChatFormatter:
# https://huggingface.co/docs/transformers/main/chat_templating
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
@@ -720,7 +749,7 @@ def format_autotokenizer(


def hf_autotokenizer_to_chat_completion_handler(
pretrained_model_name_or_path: Union[str, os.PathLike[str]]
pretrained_model_name_or_path: Union[str, os.PathLike[str]],
) -> LlamaChatCompletionHandler:
chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path)
return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -1790,7 +1819,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
}
],
},
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"finish_reason": "tool_calls",
}
],
@@ -2202,7 +2233,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -2304,7 +2337,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": "assistant",
"content": None,
@@ -2342,7 +2377,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": "assistant",
"content": buffer.pop(0),
@@ -2365,7 +2402,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": "assistant",
"content": (
@@ -2451,7 +2490,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
chunk["choices"][0]["logprobs"]
),
"delta": {
"role": None,
"content": None,
@@ -2685,7 +2726,9 @@ def generate_streaming(tools, functions, function_call, prompt):
choices=[
{
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"message": {
"role": "assistant",
"content": None if content == "" else content,
@@ -2795,9 +2838,7 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
embed = self._llava_cpp.llava_image_embed_make_with_bytes(
self.clip_ctx,
n_threads_batch,
(ctypes.c_uint8 * len(image_bytes)).from_buffer(
bytearray(image_bytes)
),
(ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
len(image_bytes),
)
self._last_image_embed = embed
@@ -2869,7 +2910,6 @@ def __call__(
if self.verbose:
print(text, file=sys.stderr)


# Evaluate prompt
llama.reset()
llama._ctx.kv_cache_clear()
@@ -2885,7 +2925,9 @@ def __call__(
llama.eval(tokens)
else:
image_bytes = self.load_image(value)
embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
embed = self._embed_image_bytes(
image_bytes, llama.context_params.n_threads_batch
)
if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
raise ValueError(
f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
@@ -3404,7 +3446,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
"{% endif %}"
"{% endif %}"
"{% endfor %}"

"{% for content in message['content'] %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
@@ -3817,7 +3858,9 @@ def chatml_function_calling(
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
"logprobs": _convert_text_completion_logprobs_to_chat(
completion["choices"][0]["logprobs"]
),
"message": {
"role": "assistant",
"content": None,
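The block added to chat_completion_handler above writes one JSON object per request ({"prompt": ..., "prompt_tokens": [...]}) and serializes writers with filelock.FileLock so concurrent processes don't interleave lines. A hedged sketch for reading such a dump back, assuming the placeholder file name prompts.ndjson:

import json
import pathlib

for line in pathlib.Path("prompts.ndjson").read_text(encoding="utf-8").splitlines():
    if not line.strip():
        continue
    record = json.loads(line)
    # "prompt" is the rendered chat template; "prompt_tokens" is the token id list.
    print(len(record["prompt_tokens"]), record["prompt"][:80])
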
llama_cpp/server/model.py — 3 changes: 3 additions & 0 deletions
@@ -223,6 +223,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
import functools

kwargs = {}
# Set this here so that it works w/ llama_cpp.Llama.from_pretrained as
# well as the 'normal' constructor.
kwargs["formatted_prompt_path"] = settings.formatted_prompt_path

if settings.hf_model_repo_id is not None:
create_fn = functools.partial(
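For the server path shown above, the field is read from ModelSettings and handed to the Llama constructor via kwargs. A sketch under stated assumptions (placeholder model path; on the command line the field presumably surfaces as --formatted_prompt_path, following the server's usual mapping of ModelSettings fields to CLI arguments):

from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/model.gguf",             # placeholder model path
    formatted_prompt_path="prompts.ndjson",  # must match the .ndjson pattern
)
llama = load_llama_from_model_settings(settings)  # kwarg is forwarded to Llama
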
llama_cpp/server/settings.py — 8 changes: 8 additions & 0 deletions
@@ -188,6 +188,14 @@ class ModelSettings(BaseSettings):
default=None,
description="Type of the value cache quantization.",
)

# Path to store formatted prompts as NDJSON
formatted_prompt_path: Optional[str] = Field(
default=None,
pattern=r".*\.ndjson$",
description="Output path to store formatted prompts as NDJSON.",
)

# Misc
verbose: bool = Field(
default=True, description="Whether to print debug information."
pyproject.toml — 8 changes: 5 additions & 3 deletions
@@ -15,11 +15,11 @@ dependencies = [
"diskcache>=5.6.1",
"jinja2>=2.11.3",
"PyTrie>=0.4.0",
"filelock>=3.18.0",
]
requires-python = ">=3.8"
requires-python = ">=3.9"
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@@ -56,9 +56,11 @@ dev = [
"httpx>=0.24.1",
"pandas>=2.2.1",
"tqdm>=4.66.2",
]
pyinstaller = [
"pyinstaller>=6.11.1",
]
all = ["llama_cpp_python[server,test,dev]"]
all = ["llama_cpp_python[server,test,dev,pyinstaller]"]

[tool.scikit-build]
wheel.packages = ["llama_cpp"]
tests/test_settings.py — 24 changes: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import pytest

from llama_cpp.server.settings import ModelSettings
from pydantic import ValidationError

# Required to pass in model name
DUMMY_MODEL_NAME = "foo"


def test_formatted_prompt_path_default_none():
m = ModelSettings(model=DUMMY_MODEL_NAME)
assert m.formatted_prompt_path is None


def test_validation_error_if_prompt_path_not_endswith_ndjson():
with pytest.raises(
ValidationError, match=r"String should match pattern '.*\\.ndjson\$'"
):
ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="invalid_path.txt")


def test_formatted_prompt_path_works_if_endswith_ndjson():
m = ModelSettings(model=DUMMY_MODEL_NAME, formatted_prompt_path="valid_path.ndjson")
assert m.formatted_prompt_path == "valid_path.ndjson"