riverzhou
diff --git a/‎CHANGELOG.md
Copy file name to clipboardExpand all lines: CHANGELOG.md
+7Lines changed: 7 additions & 0 deletions b/‎CHANGELOG.md
Copy file name to clipboardExpand all lines: CHANGELOG.md
+7Lines changed: 7 additions & 0 deletions
diff --git a/‎docs/api-reference.md
Copy file name to clipboardExpand all lines: docs/api-reference.md
+16Lines changed: 16 additions & 0 deletions b/‎docs/api-reference.md
Copy file name to clipboardExpand all lines: docs/api-reference.md
+16Lines changed: 16 additions & 0 deletions
diff --git a/‎llama_cpp/llama.py
Copy file name to clipboardExpand all lines: llama_cpp/llama.py
+49-46Lines changed: 49 additions & 46 deletions b/‎llama_cpp/llama.py
Copy file name to clipboardExpand all lines: llama_cpp/llama.py
+49-46Lines changed: 49 additions & 46 deletions
diff --git a/‎llama_cpp/llama_cpp.py
Copy file name to clipboardExpand all lines: llama_cpp/llama_cpp.py
+27-5Lines changed: 27 additions & 5 deletions b/‎llama_cpp/llama_cpp.py
Copy file name to clipboardExpand all lines: llama_cpp/llama_cpp.py
+27-5Lines changed: 27 additions & 5 deletions
diff --git a/‎pyproject.toml
Copy file name to clipboardExpand all lines: pyproject.toml
+1-1Lines changed: 1 addition & 1 deletion b/‎pyproject.toml
Copy file name to clipboardExpand all lines: pyproject.toml
+1-1Lines changed: 1 addition & 1 deletion
diff --git a/‎setup.py
Copy file name to clipboardExpand all lines: setup.py
+1-1Lines changed: 1 addition & 1 deletion b/‎setup.py
Copy file name to clipboardExpand all lines: setup.py
+1-1Lines changed: 1 addition & 1 deletion
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.67]
+
+## Fixed
+
+- Fix performance bug in Llama model by pre-allocating memory for tokens and logits.
+- Fix bug in Llama model where the model was not freed after use.
+
 ## [0.1.66]
 
 ## Added
 
@@ -32,6 +32,22 @@ title: API Reference
     options:
         show_root_heading: true
 
+::: llama_cpp.LogitsProcessor
+    options:
+        show_root_heading: true
+
+::: llama_cpp.LogitsProcessorList
+    options:
+        show_root_heading: true
+
+::: llama_cpp.StoppingCriteria
+    options:
+        show_root_heading: true
+
+::: llama_cpp.StoppingCriteriaList
+    options:
+        show_root_heading: true
+
 ::: llama_cpp.llama_cpp
     options:
         show_if_no_docstring: true
@@ -141,7 +141,7 @@ def __getitem__(self, key: Sequence[int]) -> "LlamaState":
         if _key is None:
             raise KeyError("Key not found")
         value: "LlamaState" = self.cache.pop(_key)  # type: ignore
-        # NOTE: This puts an integer as key in cache, which breaks, 
+        # NOTE: This puts an integer as key in cache, which breaks,
         # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens
         # self.cache.push(_key, side="front")  # type: ignore
         return value
@@ -166,17 +166,15 @@ def __setitem__(self, key: Sequence[int], value: "LlamaState"):
 class LlamaState:
     def __init__(
         self,
-        eval_tokens: Deque[int],
-        eval_logits: Deque[List[float]],
         input_ids: npt.NDArray[np.intc],
         scores: npt.NDArray[np.single],
+        n_tokens: int,
         llama_state: bytes,
         llama_state_size: int,
     ):
-        self.eval_tokens = eval_tokens
-        self.eval_logits = eval_logits
         self.input_ids = input_ids
         self.scores = scores
+        self.n_tokens = n_tokens
         self.llama_state = llama_state
         self.llama_state_size = llama_state_size
 
@@ -267,8 +265,6 @@ def __init__(
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
-        self.eval_tokens: Deque[int] = deque(maxlen=n_ctx)
-        self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1)
 
         self.cache: Optional[BaseLlamaCache] = None
 
@@ -329,8 +325,30 @@ def __init__(
         self._token_nl = Llama.token_nl()
         self._token_eos = Llama.token_eos()
 
-        self._input_ids = np.array([], dtype=np.intc)
-        self._scores: npt.NDArray[np.single] = np.ndarray((0, self._n_vocab), dtype=np.single)
+        self.n_tokens = 0
+        self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
+        self.scores: npt.NDArray[np.single] = np.ndarray(
+            (n_ctx, self._n_vocab), dtype=np.single
+        )
+
+    @property
+    def _input_ids(self) -> npt.NDArray[np.intc]:
+        return self.input_ids[: self.n_tokens]
+
+    @property
+    def _scores(self) -> npt.NDArray[np.single]:
+        return self.scores[: self.n_tokens, :]
+
+    @property
+    def eval_tokens(self) -> Deque[int]:
+        return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx)
+
+    @property
+    def eval_logits(self) -> Deque[List[float]]:
+        return deque(
+            self.scores[: self.n_tokens, :].tolist(),
+            maxlen=self._n_ctx if self.params.logits_all else 1,
+        )
 
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         """Tokenize a string.
@@ -397,10 +415,7 @@ def set_cache(self, cache: Optional[BaseLlamaCache]):
 
     def reset(self):
         """Reset the model state."""
-        self.eval_tokens.clear()
-        self.eval_logits.clear()
-        self._input_ids = np.array([], dtype=np.intc)
-        self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
+        self.n_tokens = 0
 
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
@@ -410,7 +425,6 @@ def eval(self, tokens: Sequence[int]):
         """
         assert self.ctx is not None
         n_ctx = self._n_ctx
-        scores: List[npt.NDArray[np.single]] = []
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = min(n_ctx - len(batch), len(self._input_ids))
@@ -425,19 +439,14 @@ def eval(self, tokens: Sequence[int]):
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
-            self.eval_tokens.extend(batch)
-            self._input_ids: npt.NDArray[np.intc] = np.concatenate(
-                (self._input_ids, np.array(batch, dtype=np.intc)), axis=0
-            )
+            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
             rows = n_tokens if self.params.logits_all else 1
-            n_vocab = self._n_vocab
-            cols = n_vocab
-            logits_view = llama_cpp.llama_get_logits(self.ctx)
-            logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
-            self.eval_logits.extend(logits)
-            scores.append(np.array(logits, dtype=np.single))
-        self._scores = np.concatenate(scores)
+            cols = self._n_vocab
+            offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False
+            self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols]
+            # Update n_tokens
+            self.n_tokens += n_tokens
 
     def _sample(
         self,
@@ -457,8 +466,7 @@ def _sample(
         logits_processor: Optional[LogitsProcessorList] = None,
     ):
         assert self.ctx is not None
-        assert len(self.eval_logits) > 0
-        assert self._scores.shape[0] > 0
+        assert self.n_tokens > 0
         n_vocab = self._n_vocab
         n_ctx = self._n_ctx
         top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
@@ -475,7 +483,6 @@ def _sample(
                 dtype=np.single,
             )
             self._scores[-1, :] = logits
-            self.eval_logits[-1] = logits.tolist()
 
         nl_logit = logits[self._token_nl]
         candidates = self._candidates
@@ -672,14 +679,7 @@ def generate(
                     print("Llama.generate: prefix-match hit", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
-                self._input_ids = self._input_ids[:longest_prefix]
-                self._scores = self._scores[:longest_prefix, :]
-                for _ in range(len(self.eval_tokens) - longest_prefix):
-                    self.eval_tokens.pop()
-                    try:
-                        self.eval_logits.pop()
-                    except IndexError:
-                        pass
+                self.n_tokens = longest_prefix
 
         if reset:
             self.reset()
@@ -819,7 +819,9 @@ def _create_completion(
             llama_cpp.llama_reset_timings(self.ctx)
 
         if len(prompt_tokens) > self._n_ctx:
-            raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}")
+            raise ValueError(
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
+            )
 
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
@@ -1437,6 +1439,9 @@ def create_chat_completion(
             return self._convert_text_completion_to_chat(completion)
 
     def __del__(self):
+        if self.model is not None:
+            llama_cpp.llama_free_model(self.model)
+            self.model = None
         if self.ctx is not None:
             llama_cpp.llama_free(self.ctx)
             self.ctx = None
@@ -1510,22 +1515,20 @@ def save_state(self) -> LlamaState:
                 file=sys.stderr,
             )
         return LlamaState(
-            eval_tokens=self.eval_tokens.copy(),
-            eval_logits=self.eval_logits.copy(),
-            scores=self._scores.copy(),
-            input_ids=self._input_ids.copy(),
+            scores=self.scores.copy(),
+            input_ids=self.input_ids.copy(),
+            n_tokens=self.n_tokens,
             llama_state=bytes(llama_state_compact),
             llama_state_size=n_bytes,
         )
 
     def load_state(self, state: LlamaState) -> None:
         assert self.ctx is not None
-        self.eval_tokens = state.eval_tokens.copy()
-        self.eval_logits = state.eval_logits.copy()
-        self._scores = state.scores.copy()
-        self._input_ids = state.input_ids.copy()
+        self.scores = state.scores.copy()
+        self.input_ids = state.input_ids.copy()
+        self.n_tokens = state.n_tokens
         state_size = state.llama_state_size
-        LLamaStateArrayType = (llama_cpp.c_uint8 * state_size)
+        LLamaStateArrayType = llama_cpp.c_uint8 * state_size
         llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)
 
         if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size:
 
@@ -290,13 +290,14 @@ def llama_mlock_supported() -> bool:
 
 # // TODO: not great API - very likely to change
 # // Initialize the llama + ggml backend
+# // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend();
-def llama_init_backend():
-    return _lib.llama_init_backend()
+# LLAMA_API void llama_init_backend(bool numa);
+def llama_init_backend(numa: c_bool):
+    return _lib.llama_init_backend(numa)
 
 
-_lib.llama_init_backend.argtypes = []
+_lib.llama_init_backend.argtypes = [c_bool]
 _lib.llama_init_backend.restype = None
 
 
@@ -565,6 +566,27 @@ def llama_eval(
 _lib.llama_eval.restype = c_int
 
 
+# // Same as llama_eval, but use float matrix input directly.
+# LLAMA_API int llama_eval_embd(
+#         struct llama_context * ctx,
+#                     const float * embd,
+#                             int   n_tokens,
+#                             int   n_past,
+#                             int   n_threads);
+def llama_eval_embd(
+    ctx: llama_context_p,
+    embd,  # type: Array[c_float]
+    n_tokens: c_int,
+    n_past: c_int,
+    n_threads: c_int,
+) -> int:
+    return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads)
+
+
+_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int]
+_lib.llama_eval_embd.restype = c_int
+
+
 # Convert the provided text into tokens.
 # The tokens pointer must be large enough to hold the resulting tokens.
 # Returns the number of tokens on success, no more than n_max_tokens
@@ -998,5 +1020,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend()
+    llama_init_backend(c_bool(False))
     _llama_initialized = True
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.66"
+version = "0.1.67"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
 
@@ -10,7 +10,7 @@
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.66",
+    version="0.1.67",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",