Commit 874eea9

Merge remote-tracking branch 'origin/main' into riverlog

2 parents: a965112 + c67f786
6 files changed: 101 additions, 53 deletions

‎CHANGELOG.md

7 additions, 0 deletions
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.67]
+
+## Fixed
+
+- Fix performance bug in Llama model by pre-allocating memory for tokens and logits.
+- Fix bug in Llama model where the model was not freed after use.
+
 ## [0.1.66]
 
 ## Added

‎docs/api-reference.md

16 additions, 0 deletions
@@ -32,6 +32,22 @@ title: API Reference
     options:
         show_root_heading: true
 
+::: llama_cpp.LogitsProcessor
+    options:
+        show_root_heading: true
+
+::: llama_cpp.LogitsProcessorList
+    options:
+        show_root_heading: true
+
+::: llama_cpp.StoppingCriteria
+    options:
+        show_root_heading: true
+
+::: llama_cpp.StoppingCriteriaList
+    options:
+        show_root_heading: true
+
 ::: llama_cpp.llama_cpp
     options:
         show_if_no_docstring: true
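The four classes added to the API reference are the extension points for custom sampling and stopping behavior. Below is a hypothetical usage sketch, not code from this commit: the callback signatures and the `logits_processor`/`stopping_criteria` keyword arguments on the completion call are assumptions based on the documented class names, and the model path is a placeholder.

```python
# Hypothetical sketch: wiring a logits processor and a stopping criterion into
# a completion call. Assumes each callback receives (input_ids, scores) and
# that the completion call accepts logits_processor= and stopping_criteria=.
from llama_cpp import Llama, LogitsProcessorList, StoppingCriteriaList

llm = Llama(model_path="./model.bin")  # placeholder path

def suppress_newline(input_ids, scores):
    # Push the newline token's logit to -inf so it is never sampled.
    scores[Llama.token_nl()] = float("-inf")
    return scores

def stop_after_64_tokens(input_ids, scores):
    # Return True to stop generation once 64 tokens have been evaluated.
    return len(input_ids) >= 64

output = llm(
    "Q: Name the planets in the solar system. A:",
    max_tokens=128,
    logits_processor=LogitsProcessorList([suppress_newline]),
    stopping_criteria=StoppingCriteriaList([stop_after_64_tokens]),
)
print(output["choices"][0]["text"])
```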

‎llama_cpp/llama.py

49 additions, 46 deletions
@@ -141,7 +141,7 @@ def __getitem__(self, key: Sequence[int]) -> "LlamaState":
         if _key is None:
             raise KeyError("Key not found")
         value: "LlamaState" = self.cache.pop(_key)  # type: ignore
-        # NOTE: This puts an integer as key in cache, which breaks,
+        # NOTE: This puts an integer as key in cache, which breaks,
         # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens
         # self.cache.push(_key, side="front")  # type: ignore
         return value
@@ -166,17 +166,15 @@ def __setitem__(self, key: Sequence[int], value: "LlamaState"):
 class LlamaState:
     def __init__(
         self,
-        eval_tokens: Deque[int],
-        eval_logits: Deque[List[float]],
         input_ids: npt.NDArray[np.intc],
         scores: npt.NDArray[np.single],
+        n_tokens: int,
         llama_state: bytes,
         llama_state_size: int,
     ):
-        self.eval_tokens = eval_tokens
-        self.eval_logits = eval_logits
         self.input_ids = input_ids
         self.scores = scores
+        self.n_tokens = n_tokens
         self.llama_state = llama_state
         self.llama_state_size = llama_state_size
 
@@ -267,8 +265,6 @@ def __init__(
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
-        self.eval_tokens: Deque[int] = deque(maxlen=n_ctx)
-        self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1)
 
         self.cache: Optional[BaseLlamaCache] = None
 
@@ -329,8 +325,30 @@ def __init__(
         self._token_nl = Llama.token_nl()
         self._token_eos = Llama.token_eos()
 
-        self._input_ids = np.array([], dtype=np.intc)
-        self._scores: npt.NDArray[np.single] = np.ndarray((0, self._n_vocab), dtype=np.single)
+        self.n_tokens = 0
+        self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
+        self.scores: npt.NDArray[np.single] = np.ndarray(
+            (n_ctx, self._n_vocab), dtype=np.single
+        )
+
+    @property
+    def _input_ids(self) -> npt.NDArray[np.intc]:
+        return self.input_ids[: self.n_tokens]
+
+    @property
+    def _scores(self) -> npt.NDArray[np.single]:
+        return self.scores[: self.n_tokens, :]
+
+    @property
+    def eval_tokens(self) -> Deque[int]:
+        return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx)
+
+    @property
+    def eval_logits(self) -> Deque[List[float]]:
+        return deque(
+            self.scores[: self.n_tokens, :].tolist(),
+            maxlen=self._n_ctx if self.params.logits_all else 1,
+        )
 
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         """Tokenize a string.
@@ -397,10 +415,7 @@ def set_cache(self, cache: Optional[BaseLlamaCache]):
 
     def reset(self):
         """Reset the model state."""
-        self.eval_tokens.clear()
-        self.eval_logits.clear()
-        self._input_ids = np.array([], dtype=np.intc)
-        self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
+        self.n_tokens = 0
 
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
@@ -410,7 +425,6 @@ def eval(self, tokens: Sequence[int]):
         """
         assert self.ctx is not None
         n_ctx = self._n_ctx
-        scores: List[npt.NDArray[np.single]] = []
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = min(n_ctx - len(batch), len(self._input_ids))
@@ -425,19 +439,14 @@ def eval(self, tokens: Sequence[int]):
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
-            self.eval_tokens.extend(batch)
-            self._input_ids: npt.NDArray[np.intc] = np.concatenate(
-                (self._input_ids, np.array(batch, dtype=np.intc)), axis=0
-            )
+            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
             rows = n_tokens if self.params.logits_all else 1
-            n_vocab = self._n_vocab
-            cols = n_vocab
-            logits_view = llama_cpp.llama_get_logits(self.ctx)
-            logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
-            self.eval_logits.extend(logits)
-            scores.append(np.array(logits, dtype=np.single))
-        self._scores = np.concatenate(scores)
+            cols = self._n_vocab
+            offset = 0 if self.params.logits_all else n_tokens - 1  # NOTE: Only save the last token logits if logits_all is False
+            self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols]
+            # Update n_tokens
+            self.n_tokens += n_tokens
 
     def _sample(
         self,
@@ -457,8 +466,7 @@ def _sample(
         logits_processor: Optional[LogitsProcessorList] = None,
     ):
         assert self.ctx is not None
-        assert len(self.eval_logits) > 0
-        assert self._scores.shape[0] > 0
+        assert self.n_tokens > 0
         n_vocab = self._n_vocab
         n_ctx = self._n_ctx
         top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
@@ -475,7 +483,6 @@ def _sample(
                 dtype=np.single,
             )
             self._scores[-1, :] = logits
-            self.eval_logits[-1] = logits.tolist()
 
         nl_logit = logits[self._token_nl]
         candidates = self._candidates
@@ -672,14 +679,7 @@ def generate(
                     print("Llama.generate: prefix-match hit", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
-                self._input_ids = self._input_ids[:longest_prefix]
-                self._scores = self._scores[:longest_prefix, :]
-                for _ in range(len(self.eval_tokens) - longest_prefix):
-                    self.eval_tokens.pop()
-                    try:
-                        self.eval_logits.pop()
-                    except IndexError:
-                        pass
+                self.n_tokens = longest_prefix
 
         if reset:
             self.reset()
@@ -819,7 +819,9 @@ def _create_completion(
             llama_cpp.llama_reset_timings(self.ctx)
 
         if len(prompt_tokens) > self._n_ctx:
-            raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}")
+            raise ValueError(
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
+            )
 
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
@@ -1437,6 +1439,9 @@ def create_chat_completion(
         return self._convert_text_completion_to_chat(completion)
 
     def __del__(self):
+        if self.model is not None:
+            llama_cpp.llama_free_model(self.model)
+            self.model = None
         if self.ctx is not None:
             llama_cpp.llama_free(self.ctx)
             self.ctx = None
@@ -1510,22 +1515,20 @@ def save_state(self) -> LlamaState:
                 file=sys.stderr,
             )
         return LlamaState(
-            eval_tokens=self.eval_tokens.copy(),
-            eval_logits=self.eval_logits.copy(),
-            scores=self._scores.copy(),
-            input_ids=self._input_ids.copy(),
+            scores=self.scores.copy(),
+            input_ids=self.input_ids.copy(),
+            n_tokens=self.n_tokens,
             llama_state=bytes(llama_state_compact),
             llama_state_size=n_bytes,
         )
 
     def load_state(self, state: LlamaState) -> None:
         assert self.ctx is not None
-        self.eval_tokens = state.eval_tokens.copy()
-        self.eval_logits = state.eval_logits.copy()
-        self._scores = state.scores.copy()
-        self._input_ids = state.input_ids.copy()
+        self.scores = state.scores.copy()
+        self.input_ids = state.input_ids.copy()
+        self.n_tokens = state.n_tokens
         state_size = state.llama_state_size
-        LLamaStateArrayType = (llama_cpp.c_uint8 * state_size)
+        LLamaStateArrayType = llama_cpp.c_uint8 * state_size
         llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)
 
         if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size:
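The performance fix in llama.py above replaces per-batch `np.concatenate` calls and unbounded deques with buffers pre-allocated to the context window; `n_tokens` tracks how much of each buffer is valid, so `reset()` and prefix reuse become O(1) cursor updates, and the old `eval_tokens`/`eval_logits` views are kept only as read-only properties. The standalone sketch below shows the same pattern with illustrative names and shapes that are not part of the library.

```python
import numpy as np

# Illustrative sketch of the pre-allocation pattern adopted in this commit:
# fixed buffers sized to the context window plus an n_tokens cursor, instead
# of concatenating a new array for every evaluated batch.
class TokenBuffer:
    def __init__(self, n_ctx: int, n_vocab: int):
        self.n_tokens = 0
        self.input_ids = np.ndarray((n_ctx,), dtype=np.intc)          # filled as tokens arrive
        self.scores = np.ndarray((n_ctx, n_vocab), dtype=np.single)   # one logits row per token

    def append(self, batch: list, logits: np.ndarray) -> None:
        # Write into the already-allocated slices and advance the cursor.
        n = len(batch)
        self.input_ids[self.n_tokens : self.n_tokens + n] = batch
        self.scores[self.n_tokens : self.n_tokens + n, :] = logits
        self.n_tokens += n

    def reset(self) -> None:
        # O(1): rewind the cursor, the buffers are reused.
        self.n_tokens = 0

    @property
    def valid_ids(self) -> np.ndarray:
        return self.input_ids[: self.n_tokens]


buf = TokenBuffer(n_ctx=512, n_vocab=32000)
buf.append([1, 2, 3], np.zeros((3, 32000), dtype=np.single))
```

At the API level the change is transparent: `save_state()` and `load_state()` still round-trip the same data, now by copying the full buffers plus `n_tokens` instead of the deques.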

‎llama_cpp/llama_cpp.py

27 additions, 5 deletions
@@ -290,13 +290,14 @@ def llama_mlock_supported() -> bool:
 
 # // TODO: not great API - very likely to change
 # // Initialize the llama + ggml backend
+# // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend();
-def llama_init_backend():
-    return _lib.llama_init_backend()
+# LLAMA_API void llama_init_backend(bool numa);
+def llama_init_backend(numa: c_bool):
+    return _lib.llama_init_backend(numa)
 
 
-_lib.llama_init_backend.argtypes = []
+_lib.llama_init_backend.argtypes = [c_bool]
 _lib.llama_init_backend.restype = None
 
 
@@ -565,6 +566,27 @@ def llama_eval(
 _lib.llama_eval.restype = c_int
 
 
+# // Same as llama_eval, but use float matrix input directly.
+# LLAMA_API int llama_eval_embd(
+#     struct llama_context * ctx,
+#     const float * embd,
+#     int n_tokens,
+#     int n_past,
+#     int n_threads);
+def llama_eval_embd(
+    ctx: llama_context_p,
+    embd,  # type: Array[c_float]
+    n_tokens: c_int,
+    n_past: c_int,
+    n_threads: c_int,
+) -> int:
+    return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads)
+
+
+_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int]
+_lib.llama_eval_embd.restype = c_int
+
+
 # Convert the provided text into tokens.
 # The tokens pointer must be large enough to hold the resulting tokens.
 # Returns the number of tokens on success, no more than n_max_tokens
@@ -998,5 +1020,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend()
+    llama_init_backend(c_bool(False))
     _llama_initialized = True
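Two binding changes land in llama_cpp.py: `llama_init_backend` now takes a `numa` flag (the module passes `c_bool(False)` at import time), and a new `llama_eval_embd` wrapper feeds a float embedding matrix to the model instead of token ids. A hypothetical call sketch follows; the model path, thread count, and the zero-filled embedding are placeholders, and the use of `Llama(...).ctx` to obtain a raw context is an assumption for illustration only.

```python
import ctypes

import llama_cpp
from llama_cpp import Llama

# Placeholder setup: reuse the high-level wrapper's context for the low-level call.
llm = Llama(model_path="./model.bin")
ctx = llm.ctx  # raw llama_context_p used by the ctypes bindings

n_embd = llama_cpp.llama_n_embd(ctx)   # embedding width of the loaded model
embd = (ctypes.c_float * n_embd)()     # one embedding-vector "token", zero-filled

# Arguments after embd are n_tokens, n_past, n_threads.
return_code = llama_cpp.llama_eval_embd(ctx, embd, 1, 0, 4)
if return_code != 0:
    raise RuntimeError(f"llama_eval_embd returned {return_code}")
```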

‎pyproject.toml

1 addition, 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.66"
+version = "0.1.67"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"

‎setup.py

1 addition, 1 deletion
@@ -10,7 +10,7 @@
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.66",
+    version="0.1.67",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
