@@ -329,6 +329,8 @@ def __init__(
             (n_ctx, self._n_vocab), dtype=np.single
         )

+        self._mirostat_mu = ctypes.c_float(2.0 * 5.0)  # TODO: Move this to sampling context
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         assert self._ctx.ctx is not None
@@ -516,7 +518,7 @@ def sample(
                 candidates=self._candidates,
                 tau=mirostat_tau,
                 eta=mirostat_eta,
-                mu=2.0 * mirostat_tau,
+                mu=ctypes.pointer(self._mirostat_mu),
                 m=100,
             )
         elif mirostat_mode == 2:
@@ -525,7 +527,7 @@ def sample(
                 candidates=self._candidates,
                 tau=mirostat_tau,
                 eta=mirostat_eta,
-                mu=2.0 * mirostat_tau,
+                mu=ctypes.pointer(self._mirostat_mu)
             )
         else:
             self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
@@ -581,6 +583,10 @@ def generate(
         Yields:
             The generated tokens.
         """
+        # Reset mirostat sampling
+        self._mirostat_mu = ctypes.c_float(2.0 * mirostat_tau)
+
+        # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
             longest_prefix = 0
             for a, b in zip(self._input_ids, tokens[:-1]):
@@ -595,12 +601,15 @@ def generate(
                 tokens = tokens[longest_prefix:]
                 self.n_tokens = longest_prefix

+        # Reset the model state
         if reset:
             self.reset()

+        # Reset the grammar
         if grammar is not None:
             grammar.reset()

+        # Eval and sample
         while True:
             self.eval(tokens)
             token = self.sample(
0 commit comments