Commit ca4cb88

Fix destructor NoneType is not callable error

1 parent 01cb3a0
3 files changed: +45 −30 lines
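
Background on the fix: a "'NoneType' object is not callable" error raised from a destructor usually means __del__ ran at a point where the callable it depends on no longer resolves, most often during interpreter shutdown or after __init__ bailed out early. The commit applies the same defensive pattern to every native wrapper it touches: declare the free function as a None class attribute, bind the real ctypes function to the instance inside __init__, and check it for None in __del__ before calling it. A minimal sketch of that pattern in plain Python (the class and names are illustrative, not part of llama-cpp-python):

class NativeHandle:
    # Class-level fallback: the attribute exists even when __init__ never
    # ran to completion (for example, the constructor raised early).
    _free_fn = None

    def __init__(self, ptr, free_fn):
        # Bind the cleanup callable to the instance while the module that
        # provides it is still fully loaded.
        self._free_fn = free_fn
        self.ptr = ptr

    def __del__(self):
        # Guard both the resource and the callable: either one can
        # legitimately be None by the time the destructor runs.
        if self.ptr is not None and self._free_fn is not None:
            self._free_fn(self.ptr)
        self.ptr = None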

‎llama_cpp/llama.py

+12 −6 (12 additions, 6 deletions)
@@ -213,7 +213,7 @@ class _LlamaModel:
 
     NOTE: For stability it's recommended you use the Llama class instead."""
 
-    _llama_free_model = llama_cpp._lib.llama_free_model  # type: ignore
+    _llama_free_model = None
 
     def __init__(
         self,
@@ -226,6 +226,8 @@ def __init__(
         self.params = params
         self.verbose = verbose
 
+        self._llama_free_model = llama_cpp._lib.llama_free_model  # type: ignore
+
         if not os.path.exists(path_model):
             raise ValueError(f"Model path does not exist: {path_model}")
 
@@ -236,7 +238,7 @@ def __init__(
 
     def __del__(self):
         with suppress_stdout_stderr(disable=self.verbose):
-            if self.model is not None:
+            if self.model is not None and self._llama_free_model is not None:
                 self._llama_free_model(self.model)
             self.model = None
 
@@ -396,7 +398,7 @@ class _LlamaContext:
 
     NOTE: For stability it's recommended you use the Llama class instead."""
 
-    _llama_free = llama_cpp._lib.llama_free  # type: ignore
+    _llama_free = None
 
     def __init__(
         self,
@@ -409,14 +411,16 @@ def __init__(
         self.params = params
         self.verbose = verbose
 
+        self._llama_free = llama_cpp._lib.llama_free  # type: ignore
+
         with suppress_stdout_stderr(disable=self.verbose):
             self.ctx = llama_cpp.llama_new_context_with_model(
                 self.model.model, self.params
             )
 
     def __del__(self):
         with suppress_stdout_stderr(disable=self.verbose):
-            if self.ctx is not None:
+            if self.ctx is not None and self._llama_free is not None:
                 self._llama_free(self.ctx)
             self.ctx = None
 
@@ -645,7 +649,7 @@ def default_params():
 
 
 class _LlamaBatch:
-    _llama_batch_free = llama_cpp._lib.llama_batch_free  # type: ignore
+    _llama_batch_free = None
 
     def __init__(
         self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
@@ -655,14 +659,16 @@ def __init__(
         self.n_seq_max = n_seq_max
         self.verbose = verbose
 
+        self._llama_batch_free = llama_cpp._lib.llama_batch_free  # type: ignore
+
         with suppress_stdout_stderr(disable=self.verbose):
            self.batch = llama_cpp.llama_batch_init(
                 self.n_tokens, self.embd, self.n_seq_max
             )
 
     def __del__(self):
         with suppress_stdout_stderr(disable=self.verbose):
-            if self.batch is not None:
+            if self.batch is not None and self._llama_batch_free is not None:
                 self._llama_batch_free(self.batch)
             self.batch = None
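
All three wrappers in this file (_LlamaModel, _LlamaContext, _LlamaBatch) receive the identical three-part change: a None class attribute, an instance-level binding of the ctypes symbol in __init__, and a double None check in __del__. For readers less familiar with ctypes, a standalone, POSIX-flavored sketch of that shape follows, using libc's malloc/free as a stand-in for the llama.cpp allocate/free pair; nothing here is part of the library's API:

import ctypes
import ctypes.util

# libc stands in for the ctypes-loaded native library.
_libc = ctypes.CDLL(ctypes.util.find_library("c"))
_libc.malloc.restype = ctypes.c_void_p
_libc.malloc.argtypes = [ctypes.c_size_t]
_libc.free.argtypes = [ctypes.c_void_p]


class _NativeBuffer:
    _free = None  # fallback so __del__ never sees a missing attribute

    def __init__(self, size: int):
        self._free = _libc.free        # bound per instance at construction time
        self.ptr = _libc.malloc(size)  # opaque native resource

    def __del__(self):
        if self.ptr is not None and self._free is not None:
            self._free(self.ptr)
        self.ptr = None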

‎llama_cpp/llama_chat_format.py

+32 −23 (32 additions, 23 deletions)
@@ -9,6 +9,8 @@
 import llama_cpp.llama_types as llama_types
 import llama_cpp.llama_grammar as llama_grammar
 
+from ._utils import suppress_stdout_stderr
+
 
 class LlamaChatCompletionHandler(Protocol):
     def __call__(
@@ -775,20 +777,26 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
 
 
 class Llava15ChatHandler:
-    def __init__(self, clip_model_path: str):
+    _clip_free = None
+
+    def __init__(self, clip_model_path: str, verbose: bool = False):
         import llama_cpp.llava_cpp as llava_cpp
 
         self._llava_cpp = llava_cpp
         self.clip_model_path = clip_model_path
+        self.verbose = verbose
+        self._clip_free = self._llava_cpp._libllava.clip_free  # type: ignore
 
-        self.clip_ctx = self._llava_cpp.clip_model_load(
-            self.clip_model_path.encode(), 0
-        )
+        with suppress_stdout_stderr(disable=self.verbose):
+            self.clip_ctx = self._llava_cpp.clip_model_load(
+                self.clip_model_path.encode(), 0
+            )
 
     def __del__(self):
-        if self.clip_ctx is not None:
-            self._llava_cpp.clip_free(self.clip_ctx)
-            self.clip_ctx = None
+        with suppress_stdout_stderr(disable=self.verbose):
+            if self.clip_ctx is not None and self._clip_free is not None:
+                self._clip_free(self.clip_ctx)
+                self.clip_ctx = None
 
     def load_image(self, image_url: str) -> bytes:
         if image_url.startswith("data:"):
@@ -881,27 +889,28 @@ def __call__(
                 c_ubyte_ptr = (
                     ctypes.c_ubyte * len(data_array)
                 ).from_buffer(data_array)
-                embed = self._llava_cpp.llava_image_embed_make_with_bytes(
-                    ctx_clip=self.clip_ctx,
-                    n_threads=llama.context_params.n_threads,
-                    image_bytes=c_ubyte_ptr,
-                    image_bytes_length=len(image_bytes),
-                )
-                # image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes)
-                # embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes))
+                with suppress_stdout_stderr(disable=self.verbose):
+                    embed = self._llava_cpp.llava_image_embed_make_with_bytes(
+                        ctx_clip=self.clip_ctx,
+                        n_threads=llama.context_params.n_threads,
+                        image_bytes=c_ubyte_ptr,
+                        image_bytes_length=len(image_bytes),
+                    )
                 try:
                     n_past = ctypes.c_int(llama.n_tokens)
                     n_past_p = ctypes.pointer(n_past)
-                    self._llava_cpp.llava_eval_image_embed(
-                        ctx_llama=llama.ctx,
-                        embed=embed,
-                        n_batch=llama.n_batch,
-                        n_past=n_past_p,
-                    )
+                    with suppress_stdout_stderr(disable=self.verbose):
+                        self._llava_cpp.llava_eval_image_embed(
+                            ctx_llama=llama.ctx,
+                            embed=embed,
+                            n_batch=llama.n_batch,
+                            n_past=n_past_p,
+                        )
                     assert llama.n_ctx() >= n_past.value
                     llama.n_tokens = n_past.value
                 finally:
-                    self._llava_cpp.llava_image_embed_free(embed)
+                    with suppress_stdout_stderr(disable=self.verbose):
+                        self._llava_cpp.llava_image_embed_free(embed)
             if message["role"] == "assistant" and message["content"] is not None:
                 llama.eval(
                     llama.tokenize(
@@ -910,7 +919,7 @@ def __call__(
                     )
                 llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False))
 
-        prompt = llama._input_ids.tolist()
+        prompt = llama.input_ids[:llama.n_tokens].tolist()
 
         return _convert_completion_to_chat(
             llama.create_completion(
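
suppress_stdout_stderr itself comes from llama_cpp._utils and its implementation is not shown in this diff. For orientation only, a suppressor of this kind is typically written at the file-descriptor level roughly as below; this is a generic sketch of the technique, not the library's code. The disable flag mirrors how the patch threads disable=self.verbose through every native clip/llava call:

import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr_sketch(disable: bool = False):
    # Verbose mode: leave stdout/stderr alone.
    if disable:
        yield
        return
    devnull = os.open(os.devnull, os.O_WRONLY)
    saved_out, saved_err = os.dup(1), os.dup(2)
    sys.stdout.flush()
    sys.stderr.flush()
    os.dup2(devnull, 1)  # route C-level stdout to /dev/null
    os.dup2(devnull, 2)  # route C-level stderr to /dev/null
    try:
        yield
    finally:
        os.dup2(saved_out, 1)
        os.dup2(saved_err, 2)
        for fd in (devnull, saved_out, saved_err):
            os.close(fd)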

‎llama_cpp/server/app.py

+1 −1 (1 addition, 1 deletion)
@@ -384,7 +384,7 @@ def create_app(settings: Optional[Settings] = None):
     chat_handler = None
     if settings.chat_format == "llava-1-5":
         assert settings.clip_model_path is not None
-        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path)
+        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose)
     ##
 
     llama = llama_cpp.Llama(
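
Outside the server, the new verbose flag is passed when the handler is constructed directly. A usage sketch with placeholder file paths and an illustrative image URL, assuming the OpenAI-style multimodal message layout this handler consumes:

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# verbose=False keeps the native clip/llava output suppressed, which is
# what the server change above now forwards from its settings.
chat_handler = Llava15ChatHandler(clip_model_path="./mmproj.bin", verbose=False)

llm = Llama(
    model_path="./llava-v1.5-7b.Q4_K_M.gguf",  # placeholder model path
    chat_handler=chat_handler,
    n_ctx=2048,
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        },
    ],
)
print(response["choices"][0]["message"]["content"])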
