Commit 658b244

Merge branch 'main' of github.com:abetlen/llama-cpp-python into main

2 parents: 7bf07ec + 3c7501b

2 files changed: 15 additions, 8 deletions

llama_cpp/llama.py

11 additions, 8 deletions
@@ -198,6 +198,7 @@ def __init__(
             A Llama instance.
         """
         self.verbose = verbose
+        self._stack = contextlib.ExitStack()
 
         set_verbose(verbose)
 
@@ -365,8 +366,6 @@ def __init__(
         if not os.path.exists(model_path):
            raise ValueError(f"Model path does not exist: {model_path}")
 
-        self._stack = contextlib.ExitStack()
-
         self._model = self._stack.enter_context(
             contextlib.closing(
                 _LlamaModel(
@@ -420,6 +419,15 @@ def __init__(
                 raise RuntimeError(
                     f"Failed to initialize LoRA adapter from lora path: {self.lora_path}"
                 )
+
+            def free_lora_adapter():
+                if self._lora_adapter is None:
+                    return
+                llama_cpp.llama_lora_adapter_free(self._lora_adapter)
+                self._lora_adapter = None
+
+            self._stack.callback(free_lora_adapter)
+
             assert self._ctx.ctx is not None
             if llama_cpp.llama_lora_adapter_set(
                 self._ctx.ctx, self._lora_adapter, self.lora_scale
@@ -2085,14 +2093,9 @@ def pooling_type(self) -> str:
 
     def close(self) -> None:
         """Explicitly free the model from memory."""
-        if hasattr(self,'_stack'):
-            if self._stack is not None:
-                self._stack.close()
+        self._stack.close()
 
     def __del__(self) -> None:
-        if hasattr(self,'_lora_adapter'):
-            if self._lora_adapter is not None:
-                llama_cpp.llama_lora_adapter_free(self._lora_adapter)
         self.close()
 
     @staticmethod
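
For context, here is a minimal sketch (not part of the commit) of the cleanup pattern the diff adopts: create a single contextlib.ExitStack at the top of __init__, register a callback for each native resource as soon as it is acquired, and let close() and __del__ simply unwind the stack. The names acquire_adapter and release_adapter below are hypothetical stand-ins for the actual llama_cpp.llama_lora_adapter_init / llama_lora_adapter_free calls.

import contextlib


def acquire_adapter() -> object:
    # Hypothetical stand-in for llama_cpp.llama_lora_adapter_init(...).
    return object()


def release_adapter(adapter: object) -> None:
    # Hypothetical stand-in for llama_cpp.llama_lora_adapter_free(...).
    del adapter


class Model:
    def __init__(self) -> None:
        # Create the stack first, so close()/__del__ can rely on it existing
        # even if a later construction step raises.
        self._stack = contextlib.ExitStack()

        self._adapter = acquire_adapter()

        def free_adapter() -> None:
            if self._adapter is None:
                return
            release_adapter(self._adapter)
            self._adapter = None

        # Register cleanup on the stack instead of freeing in __del__ directly;
        # callbacks run in reverse registration order when the stack is closed.
        self._stack.callback(free_adapter)

    def close(self) -> None:
        # Explicitly free everything registered on the stack.
        self._stack.close()

    def __del__(self) -> None:
        self.close()


m = Model()
m.close()  # a second close() would be a no-op: the stack is already unwound

Tying the adapter's lifetime to the stack keeps close() idempotent and removes the hasattr guards the old __del__ needed, since the stack exists from the first line of __init__.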

pyproject.toml

4 additions, 0 deletions
@@ -41,6 +41,10 @@ test = [
     "pytest>=7.4.0",
     "httpx>=0.24.1",
     "scipy>=1.10",
+    "fastapi>=0.100.0",
+    "sse-starlette>=1.6.1",
+    "starlette-context>=0.3.6,<0.4",
+    "pydantic-settings>=2.0.1",
 ]
 dev = [
     "black>=23.3.0",

0 commit comments
