Commit 579f526

Shouyi Wang committed: Resolve merge conflicts
2 parents 9f21f54 + 6705f9b, commit 579f526

8 files changed: +130 -78 lines

CHANGELOG.md (19 additions & 0 deletions)

@@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.71]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+
+### Fixed
+
+- (server) Fix several pydantic v2 migration bugs
+
+## [0.1.70]
+
+### Fixed
+
+- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion`
+- (server) Fixed changed settings field names from pydantic v2 migration
+
+## [0.1.69]
+
 ### Added
 
 - (server) Streaming requests can now be interrupted prematurely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting.

llama_cpp/llama.py (6 additions & 10 deletions)

@@ -833,19 +833,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if max_tokens <= 0:
-            # Unlimited, depending on n_ctx.
-            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
-                raise ValueError(
-                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-                )
-            else:
-                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
-        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
             raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
+                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens)
+
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
            max_tokens
llama_cpp/llama_cpp.py (49 additions & 6 deletions)

@@ -326,13 +326,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
 
 
-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+
+
+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()
+
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None
 
 
 # LLAMA_API struct llama_model * llama_load_model_from_file(

@@ -819,6 +829,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None
 
 
+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(

@@ -1063,5 +1106,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True
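
For context, the module already calls the (now renamed) init function once at import time, as the last hunk shows, so the only new call site most applications need is the matching teardown. A hedged usage sketch, assuming the llama_cpp shared library loads normally:

import atexit

import llama_cpp

# llama_cpp ran llama_backend_init(c_bool(False)) on import (see above), so we
# only register the new llama_backend_free for shutdown; per the upstream
# comment it is currently only meaningful for MPI builds, but it is harmless.
atexit.register(llama_cpp.llama_backend_free)
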

llama_cpp/server/__main__.py (3 additions & 3 deletions)

@@ -30,14 +30,14 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    for name, field in Settings.__model_fields__.items():
-        description = field.field_info.description
+    for name, field in Settings.model_fields.items():
+        description = field.description
         if field.default is not None and description is not None:
             description += f" (default: {field.default})"
         parser.add_argument(
             f"--{name}",
             dest=name,
-            type=field.type_,
+            type=field.annotation if field.annotation is not None else str,
             help=description,
         )
 
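This hunk is the pydantic v2 migration of the CLI: `__model_fields__` and `field_info` become `model_fields` and plain `FieldInfo` attributes. A self-contained sketch of the same pattern, using a hypothetical DemoSettings model rather than the server's real Settings class:

import argparse

from pydantic import BaseModel, Field


class DemoSettings(BaseModel):
    host: str = Field(default="localhost", description="Listen address")
    port: int = Field(default=8000, description="Listen port")


parser = argparse.ArgumentParser()
for name, field in DemoSettings.model_fields.items():
    description = field.description
    if field.default is not None and description is not None:
        description += f" (default: {field.default})"
    parser.add_argument(
        f"--{name}",
        dest=name,
        # FieldInfo.annotation holds the declared type in pydantic v2.
        type=field.annotation if field.annotation is not None else str,
        help=description,
    )

args = parser.parse_args(["--port", "8080"])
print(args.port)  # 8080
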
llama_cpp/server/app.py (49 additions & 55 deletions)

@@ -84,12 +84,8 @@ class Settings(BaseSettings):
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
-    host: str = Field(
-        default="localhost", description="Listen address"
-    )
-    port: int = Field(
-        default=8000, description="Listen port"
-    )
+    host: str = Field(default="localhost", description="Listen address")
+    port: int = Field(default=8000, description="Listen port")
     interrupt_requests: bool = Field(
         default=True,
         description="Whether to interrupt requests when a new request is received.",

@@ -183,7 +179,7 @@ def get_settings():
     yield settings
 
 
-model_field = Field(description="The model to use for generating completions.")
+model_field = Field(description="The model to use for generating completions.", default=None)
 
 max_tokens_field = Field(
     default=16, ge=1, le=2048, description="The maximum number of tokens to generate."

@@ -247,21 +243,18 @@ def get_settings():
     default=0,
     ge=0,
     le=2,
-    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)"
+    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
 )
 
 mirostat_tau_field = Field(
     default=5.0,
     ge=0.0,
     le=10.0,
-    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text"
+    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
 )
 
 mirostat_eta_field = Field(
-    default=0.1,
-    ge=0.001,
-    le=1.0,
-    description="Mirostat learning rate"
+    default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate"
 )

@@ -299,22 +292,23 @@ class CreateCompletionRequest(BaseModel):
     model: Optional[str] = model_field
     n: Optional[int] = 1
     best_of: Optional[int] = 1
-    user: Optional[str] = Field(None)
+    user: Optional[str] = Field(default=None)
 
     # llama.cpp specific parameters
     top_k: int = top_k_field
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
-    class Config:
-        schema_extra = {
-            "example": {
-                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
-                "stop": ["\n", "###"],
-            }
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                    "stop": ["\n", "###"],
+                }
+            ]
         }
-
-
+    }
 
 
 def make_logit_bias_processor(

@@ -333,7 +327,7 @@ def make_logit_bias_processor(
 
     elif logit_bias_type == "tokens":
         for token, score in logit_bias.items():
-            token = token.encode('utf-8')
+            token = token.encode("utf-8")
             for input_id in llama.tokenize(token, add_bos=False):
                 to_bias[input_id] = score

@@ -357,7 +351,7 @@ async def create_completion(
     request: Request,
     body: CreateCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
-):
+) -> llama_cpp.Completion:
     if isinstance(body.prompt, list):
         assert len(body.prompt) <= 1
         body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

@@ -369,7 +363,7 @@ async def create_completion(
         "logit_bias_type",
         "user",
     }
-    kwargs = body.dict(exclude=exclude)
+    kwargs = body.model_dump(exclude=exclude)
 
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([

@@ -401,7 +395,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
 
         return EventSourceResponse(
             recv_chan, data_sender_callable=partial(event_publisher, send_chan)
-        )
+        )  # type: ignore
     else:
         completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
         return completion

@@ -410,16 +404,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
 class CreateEmbeddingRequest(BaseModel):
     model: Optional[str] = model_field
     input: Union[str, List[str]] = Field(description="The input to embed.")
-    user: Optional[str]
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "input": "The food was delicious and the waiter...",
-            }
+    user: Optional[str] = Field(default=None)
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "input": "The food was delicious and the waiter...",
+                }
+            ]
         }
-
-
+    }
 
 
 @router.post(

@@ -429,7 +424,7 @@ async def create_embedding(
     request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
     return await run_in_threadpool(
-        llama.create_embedding, **request.dict(exclude={"user"})
+        llama.create_embedding, **request.model_dump(exclude={"user"})
     )

@@ -466,21 +461,22 @@ class CreateChatCompletionRequest(BaseModel):
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [
-                    ChatCompletionRequestMessage(
-                        role="system", content="You are a helpful assistant."
-                    ),
-                    ChatCompletionRequestMessage(
-                        role="user", content="What is the capital of France?"
-                    ),
-                ]
-            }
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        ChatCompletionRequestMessage(
+                            role="system", content="You are a helpful assistant."
+                        ).model_dump(),
+                        ChatCompletionRequestMessage(
+                            role="user", content="What is the capital of France?"
+                        ).model_dump(),
+                    ]
+                }
+            ]
         }
-
-
+    }
 
 
 @router.post(

@@ -491,14 +487,14 @@ async def create_chat_completion(
     body: CreateChatCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
     settings: Settings = Depends(get_settings),
-) -> Union[llama_cpp.ChatCompletion]:  # type: ignore
+) -> llama_cpp.ChatCompletion:
     exclude = {
         "n",
         "logit_bias",
         "logit_bias_type",
         "user",
     }
-    kwargs = body.dict(exclude=exclude)
+    kwargs = body.model_dump(exclude=exclude)
 
     if body.logit_bias is not None:
         kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([

@@ -531,7 +527,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
         return EventSourceResponse(
             recv_chan,
             data_sender_callable=partial(event_publisher, send_chan),
-        )
+        )  # type: ignore
     else:
         completion: llama_cpp.ChatCompletion = await run_in_threadpool(
             llama.create_chat_completion, **kwargs  # type: ignore

@@ -551,8 +547,6 @@ class ModelList(TypedDict):
     data: List[ModelData]
 
 
-
-
 @router.get("/v1/models")
 async def get_models(
     settings: Settings = Depends(get_settings),
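
Most of this file is the same mechanical pydantic v1 to v2 migration: `class Config` with `schema_extra` becomes `model_config` with `"json_schema_extra"`, and `.dict()` becomes `.model_dump()`. A minimal self-contained sketch of the new style, using a hypothetical ExampleRequest model rather than the server's actual request classes:

from typing import Optional

from pydantic import BaseModel, Field


class ExampleRequest(BaseModel):
    prompt: str
    user: Optional[str] = Field(default=None)

    # pydantic v2 replacement for `class Config: schema_extra = {...}`.
    model_config = {
        "json_schema_extra": {
            "examples": [{"prompt": "What is the capital of France?"}]
        }
    }


# pydantic v2 replacement for `.dict(exclude=...)` when forwarding fields.
req = ExampleRequest(prompt="Hello")
print(req.model_dump(exclude={"user"}))  # {'prompt': 'Hello'}
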

pyproject.toml (1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.68"
+version = "0.1.71"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"

setup.py (2 additions & 2 deletions)

@@ -10,15 +10,15 @@
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.68",
+    version="0.1.71",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
     package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
     packages=["llama_cpp", "llama_cpp.server"],
     install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
     extras_require={
-        "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
+        "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
     },
     python_requires=">=3.7",
     classifiers=[

0 commit comments