Commit 9c68382

Merge branch 'main' into configurable-chat-templates

2 parents: 428b64e + a72efc7

10 files changed: +808, -364 lines

‎.gitignore

2 additions, 0 deletions

```diff
@@ -1,3 +1,5 @@
+.python-version
+
 .vscode/
 
 _skbuild/
```

‎CHANGELOG.md

7 additions, 1 deletion

```diff
@@ -7,7 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- Update llama.cpp to 8781013ef654270cbead3e0011e33a6d690fb168
+## [0.2.7]
+
+- Update llama.cpp to a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
+- Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
+- Add openai-processing-ms to server response header by @Tradunsky in #748
+- Bump minimum version of scikit-build-core to 0.5.1 to fix msvc cmake issue by @abetlen in 1ed0f3ebe16993a0f961155aa4b2c85f1c68f668
+- Update `llama_types.py` to better match the openai api, old names are aliased to new ones by @abetlen in dbca136feaaf7f8b1182c4c3c90c32918b1d0bb3
 
 ## [0.2.6]
 
```

‎llama_cpp/__init__.py

1 addition, 1 deletion

```diff
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.6"
+__version__ = "0.2.7"
```

‎llama_cpp/llama.py

115 additions, 79 deletions

```diff
@@ -216,30 +216,36 @@ def __init__(
         self,
         model_path: str,
         *,
-        # NOTE: These parameters are likely to change in the future.
-        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
-        n_ctx: int = 512,
-        n_batch: int = 512,
+        # Model Params
         n_gpu_layers: int = 0,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        vocab_only: bool = False,
+        use_mmap: bool = True,
+        use_mlock: bool = False,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
+        n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
-        low_vram: bool = False,
         mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mmap: bool = True,
-        use_mlock: bool = False,
         embedding: bool = False,
-        n_threads: Optional[int] = None,
+        # Sampling Params
         last_n_tokens_size: int = 64,
+        # LoRA Params
         lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Backend Params
         numa: bool = False,
-        chat_completion_template: Optional["ChatCompletionFormat"] = None,
+        # Misc
         verbose: bool = True,
+        # Extra Params
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
```

```diff
@@ -279,79 +285,88 @@ def __init__(
 
         self.verbose = verbose
 
+        self.numa = numa
         if not Llama.__backend_initialized:
             if self.verbose:
-                llama_cpp.llama_backend_init(numa)
+                llama_cpp.llama_backend_init(self.numa)
             else:
                 with suppress_stdout_stderr():
-                    llama_cpp.llama_backend_init(numa)
+                    llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True
 
         self.model_path = model_path
 
-        self.params = llama_cpp.llama_context_default_params()
-        self.params.seed = seed
-        self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = (
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.main_gpu = main_gpu
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-        self.params.low_vram = low_vram
-        self.params.mul_mat_q = mul_mat_q
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mmap = use_mmap if lora_path is None else False
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
-
+        self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._p_tensor_split = None
-
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(
-                *tensor_split
+                *tensor_split  # type: ignore
             )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._c_tensor_split
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
+
+        self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or max(
+            multiprocessing.cpu_count() // 2, 1
+        )
 
+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.seed = seed
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_freq_base = rope_freq_base
+        self.context_params.rope_freq_scale = rope_freq_scale
+        self.context_params.mul_mat_q = mul_mat_q
+        self.context_params.f16_kv = f16_kv
+        self.context_params.logits_all = logits_all
+        self.context_params.embedding = embedding
+
+        # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
-        self.n_batch = min(n_ctx, n_batch)
 
-        self.chat_completion_template = (
-            chat_completion_template or DefaultChatCompletionFormat()
-        )
 
         self.cache: Optional[BaseLlamaCache] = None
 
-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
-
         self.lora_base = lora_base
+        self.lora_scale = lora_scale
         self.lora_path = lora_path
 
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")
 
         if verbose:
             self.model = llama_cpp.llama_load_model_from_file(
-                self.model_path.encode("utf-8"), self.params
+                self.model_path.encode("utf-8"), self.model_params
             )
         else:
             with suppress_stdout_stderr():
                 self.model = llama_cpp.llama_load_model_from_file(
-                    self.model_path.encode("utf-8"), self.params
+                    self.model_path.encode("utf-8"), self.model_params
                 )
         assert self.model is not None
 
         if verbose:
-            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model, self.context_params
+            )
         else:
             with suppress_stdout_stderr():
                 self.ctx = llama_cpp.llama_new_context_with_model(
-                    self.model, self.params
+                    self.model, self.context_params
                 )
 
         assert self.ctx is not None
```

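Internally the wrapper now mirrors llama.cpp's split between model and context parameters: `llama_model_default_params()` feeds `llama_load_model_from_file`, and `llama_context_default_params()` feeds `llama_new_context_with_model`. A rough sketch of that flow against the raw bindings, with a placeholder path and no error handling:

```python
import llama_cpp

llama_cpp.llama_backend_init(False)  # numa=False

model_params = llama_cpp.llama_model_default_params()
model_params.n_gpu_layers = 0

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 2048
ctx_params.n_threads = 8

model = llama_cpp.llama_load_model_from_file(
    b"./models/model.gguf", model_params  # placeholder path
)
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
assert model is not None and ctx is not None
```
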
```diff
@@ -360,6 +375,7 @@ def __init__(
             if llama_cpp.llama_model_apply_lora_from_file(
                 self.model,
                 self.lora_path.encode("utf-8"),
+                self.lora_scale,
                 self.lora_base.encode("utf-8")
                 if self.lora_base is not None
                 else llama_cpp.c_char_p(0),
```

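The new `lora_scale` argument is forwarded straight to `llama_model_apply_lora_from_file`; a hedged sketch of loading an adapter at reduced strength, with placeholder paths (the default of 1.0 matches the old behaviour):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",   # placeholder path
    lora_path="./loras/adapter.bin",    # placeholder path
    lora_scale=0.5,                     # blend the adapter at half strength
)
```
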
```diff
@@ -416,7 +432,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.params.logits_all else 1,
+            maxlen=self._n_ctx if self.model_params.logits_all else 1,
         )
 
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
@@ -434,7 +450,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         assert self.model is not None
         n_ctx = self._n_ctx
         tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize_with_model(
+        n_tokens = llama_cpp.llama_tokenize(
            self.model,
            text,
            len(text),
@@ -445,7 +461,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         if n_tokens < 0:
             n_tokens = abs(n_tokens)
             tokens = (llama_cpp.llama_token * n_tokens)()
-            n_tokens = llama_cpp.llama_tokenize_with_model(
+            n_tokens = llama_cpp.llama_tokenize(
                 self.model,
                 text,
                 len(text),
```

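`tokenize` and `detokenize` now call the model-level `llama_tokenize` and `llama_token_to_piece` instead of the previous `*_with_model` variants, but the Python-facing methods are unchanged; a quick round-trip sketch (placeholder path, and the round trip is only exact up to tokenizer whitespace handling):

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", verbose=False)  # placeholder path
ids = llm.tokenize(b"Hello, world!", add_bos=False)
text = llm.detokenize(ids)  # bytes; may differ by leading whitespace
```
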
```diff
@@ -473,7 +489,7 @@ def detokenize(self, tokens: List[int]) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_piece_with_model(
+            n = llama_cpp.llama_token_to_piece(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
@@ -513,17 +529,16 @@ def eval(self, tokens: Sequence[int]):
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
                 n_tokens=n_tokens,
                 n_past=n_past,
-                n_threads=self.n_threads,
             )
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.params.logits_all else 1
+            rows = n_tokens if self.context_params.logits_all else 1
             cols = self._n_vocab
             offset = (
-                0 if self.params.logits_all else n_tokens - 1
+                0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
             self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
                 -1
```

```diff
@@ -807,7 +822,7 @@ def generate(
 
     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None
-    ) -> Embedding:
+    ) -> CreateEmbeddingResponse:
         """Embed a string.
 
         Args:
@@ -819,7 +834,7 @@ def create_embedding(
         assert self.ctx is not None
         model_name: str = model if model is not None else self.model_path
 
-        if self.params.embedding == False:
+        if self.model_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
```

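`create_embedding` still requires the model to be constructed with `embedding=True` and now declares its return type as `CreateEmbeddingResponse`; a hedged usage sketch with a placeholder path, assuming the OpenAI-style response layout the changelog describes:

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", embedding=True)  # placeholder path
resp = llm.create_embedding("a sentence to embed")
vector = resp["data"][0]["embedding"]  # OpenAI-style embedding list
```
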
```diff
@@ -941,7 +956,7 @@ def _create_completion(
         else:
             stop_sequences = []
 
-        if logprobs is not None and self.params.logits_all is False:
+        if logprobs is not None and self.model_params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
```

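The logprobs guard still keys off `logits_all`; requesting them from a model built with the default `logits_all=False` raises the `ValueError` above. A hedged sketch, assuming the public `create_completion` wrapper around `_create_completion` (placeholder path):

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", logits_all=True)  # placeholder path
out = llm.create_completion("Hello", max_tokens=8, logprobs=5)  # top-5 logprobs per token
```
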
```diff
@@ -1632,47 +1647,68 @@ def __del__(self):
 
     def __getstate__(self):
         return dict(
-            verbose=self.verbose,
             model_path=self.model_path,
-            n_ctx=self.params.n_ctx,
-            n_gpu_layers=self.params.n_gpu_layers,
-            seed=self.params.seed,
-            f16_kv=self.params.f16_kv,
-            logits_all=self.params.logits_all,
-            vocab_only=self.params.vocab_only,
-            use_mmap=self.params.use_mmap,
-            use_mlock=self.params.use_mlock,
-            embedding=self.params.embedding,
-            low_vram=self.params.low_vram,
-            last_n_tokens_size=self.last_n_tokens_size,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            main_gpu=self.model_params.main_gpu,
+            tensor_split=self.tensor_split,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            # Context Params
+            seed=self.context_params.seed,
+            n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
-            n_threads=self.n_threads,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            mul_mat_q=self.context_params.mul_mat_q,
+            f16_kv=self.context_params.f16_kv,
+            logits_all=self.context_params.logits_all,
+            embedding=self.context_params.embedding,
+            # Sampling Params
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
             lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
             lora_path=self.lora_path,
-            tensor_split=self.tensor_split,
-            mul_mat_q=self.params.mul_mat_q,
+            # Backend Params
+            numa=self.numa,
+            # Misc
+            verbose=self.verbose,
         )
 
     def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
-            n_ctx=state["n_ctx"],
+            # Model Params
             n_gpu_layers=state["n_gpu_layers"],
-            seed=state["seed"],
-            f16_kv=state["f16_kv"],
-            logits_all=state["logits_all"],
+            main_gpu=state["main_gpu"],
+            tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
-            embedding=state["embedding"],
-            low_vram=state["low_vram"],
-            n_threads=state["n_threads"],
+            # Context Params
+            seed=state["seed"],
+            n_ctx=state["n_ctx"],
             n_batch=state["n_batch"],
+            n_threads=state["n_threads"],
+            n_threads_batch=state["n_threads_batch"],
+            rope_freq_base=state["rope_freq_base"],
+            rope_freq_scale=state["rope_freq_scale"],
+            mul_mat_q=state["mul_mat_q"],
+            f16_kv=state["f16_kv"],
+            logits_all=state["logits_all"],
+            embedding=state["embedding"],
+            # Sampling Params
             last_n_tokens_size=state["last_n_tokens_size"],
+            # LoRA Params
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
-            tensor_split=state["tensor_split"],
-            mul_mat_q=state["mul_mat_q"],
+            # Backend Params
+            numa=state["numa"],
+            # Misc
             verbose=state["verbose"],
         )
 
```

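`__getstate__`/`__setstate__` now round-trip the regrouped kwargs (including `n_threads_batch`, `lora_scale`, and `numa`), so a pickled `Llama` re-runs `__init__` with the same settings when unpickled; a minimal sketch with a placeholder path:

```python
import pickle
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf", n_ctx=1024, verbose=False)  # placeholder path
restored = pickle.loads(pickle.dumps(llm))  # reloads the model via __init__ with the saved kwargs
```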

```diff
@@ -1725,13 +1761,13 @@ def n_ctx(self) -> int:
 
     def n_embd(self) -> int:
         """Return the embedding size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_embd(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)
 
     def n_vocab(self) -> int:
         """Return the vocabulary size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_vocab(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)
 
     def tokenizer(self) -> "LlamaTokenizer":
         """Return the tokenizer for this model."""
```
