Commit 22b26c7

Merge branch 'abetlen:main' into main

2 parents: 207a5c3 + f7b9e6d

9 files changed (+95, -39 lines)

‎CHANGELOG.md

14 additions, 0 deletions

@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.85]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@398ede5efeb07b9adf9fbda7ea63f630d476a792
+- fix: Missing LoRA adapter after API change by @shamitv in #1630
+- fix(docker): Update Dockerfile BLAS options by @olivierdebauche in #1632
+- fix(docker): Fix GGML_CUDA param by @olivierdebauche in #1633
+- fix(docker): Update Dockerfile build options from `LLAMA_` to `GGML_` by @olivierdebauche in #1634
+- feat: FreeBSD compatibility by @yurivict in #1635
+
+## [0.2.84]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@4730faca618ff9cee0780580145e3cbe86f24876
+- fix: fix: Correcting run.sh filepath in Simple Docker implementation by @mashuk999 in #1626
+
 ## [0.2.83]

 - feat: Update llama.cpp to ggerganov/llama.cpp@081fe431aa8fb6307145c4feb3eed4f48cab19f8
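Note (illustrative, not part of this commit): the Docker entries above follow the upstream rename of the llama.cpp build flags from the `LLAMA_` prefix to `GGML_` (for example `-DLLAMA_CUBLAS=on` becomes `-DGGML_CUDA=on`). A minimal Python sketch of passing the new-style flags through `CMAKE_ARGS` when installing the package; the helper name is hypothetical and the flag set depends on your hardware:

# Hypothetical helper: install llama-cpp-python with the renamed GGML_* CMake flags.
import os
import subprocess
import sys

def install_with_flags(cmake_args: str) -> None:
    # pip rebuilds the wheel from source using the provided CMake arguments.
    env = dict(os.environ, CMAKE_ARGS=cmake_args)
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--upgrade", "--no-cache-dir", "llama-cpp-python"],
        env=env,
    )

if __name__ == "__main__":
    # CUDA build (previously -DLLAMA_CUBLAS=on):
    install_with_flags("-DGGML_CUDA=on")
    # OpenBLAS build (previously -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS):
    # install_with_flags("-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS")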

‎docker/cuda_simple/Dockerfile

2 additions, 2 deletions

@@ -15,13 +15,13 @@ COPY . .

 # setting build related env vars
 ENV CUDA_DOCKER_ARCH=all
-ENV LLAMA_CUBLAS=1
+ENV GGML_CUDA=1

 # Install depencencies
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context

 # Install llama-cpp-python (build with cuda)
-RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python

 # Run the server
 CMD python3 -m llama_cpp.server

‎docker/open_llama/Dockerfile

3 additions, 3 deletions

@@ -20,13 +20,13 @@ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fa

 # Perform the conditional installations based on the image
 RUN echo "Image: ${IMAGE}" && \
-    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
+    if [ "${IMAGE}" = "python:3-slim-bookworm" ] ; then \
     echo "OpenBLAS install:" && \
     apt-get install -y --no-install-recommends libopenblas-dev && \
-    LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
+    CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python --verbose; \
     else \
     echo "CuBLAS install:" && \
-    LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
+    CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --verbose; \
     fi

 # Clean up apt cache

‎docker/openblas_simple/Dockerfile

1 addition, 1 deletion

@@ -12,7 +12,7 @@ RUN apt update && apt install -y libopenblas-dev ninja-build build-essential pkg

 RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context

-RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
+RUN CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose

 # Run the server
 CMD python3 -m llama_cpp.server

‎docker/simple/Dockerfile

1 addition, 1 deletion

@@ -35,4 +35,4 @@ ENV PORT=8000
 EXPOSE 8000

 # Run the server start script
-CMD ["/bin/sh", "/app/run.sh"]
+CMD ["/bin/sh", "/app/docker/simple/run.sh"]

‎llama_cpp/__init__.py

1 addition, 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.83"
+__version__ = "0.2.85"

‎llama_cpp/llama.py

6 additions, 3 deletions

@@ -2088,11 +2088,14 @@ def pooling_type(self) -> str:

     def close(self) -> None:
         """Explicitly free the model from memory."""
-        self._stack.close()
+        if hasattr(self,'_stack'):
+            if self._stack is not None:
+                self._stack.close()

     def __del__(self) -> None:
-        if self._lora_adapter is not None:
-            llama_cpp.llama_lora_adapter_free(self._lora_adapter)
+        if hasattr(self,'_lora_adapter'):
+            if self._lora_adapter is not None:
+                llama_cpp.llama_lora_adapter_free(self._lora_adapter)
         self.close()

     @staticmethod
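Note (illustrative, not from this repository): the new `hasattr` guards matter because `close()` and `__del__` can run on a partially constructed object. If `__init__` raises before `_stack` or `_lora_adapter` is assigned, the previous code raised `AttributeError` during garbage collection. A standalone sketch of the pattern with purely hypothetical names:

# Standalone sketch: why destructors guard attributes that __init__ may never set.
class Resource:
    def __init__(self, fail: bool = False):
        if fail:
            raise RuntimeError("init failed before _handle was assigned")
        self._handle = object()  # stands in for an ExitStack or LoRA adapter

    def close(self) -> None:
        # Guard: the attribute is missing if __init__ raised early.
        if getattr(self, "_handle", None) is not None:
            self._handle = None

    def __del__(self) -> None:
        self.close()  # must not assume __init__ completed

try:
    Resource(fail=True)   # the half-built object is still garbage collected
except RuntimeError:
    pass                  # cleanup runs without an AttributeError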

‎llama_cpp/llama_cpp.py

66 additions, 27 deletions

@@ -28,7 +28,7 @@ def _load_shared_library(lib_base_name: str):
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths: List[pathlib.Path] = []
     # Determine the file extension based on the platform
-    if sys.platform.startswith("linux"):
+    if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"):
         _lib_paths += [
             _base_path / f"lib{lib_base_name}.so",
         ]
@@ -233,9 +233,6 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 LLAMA_DEFAULT_SEED = 0xFFFFFFFF

-# define LLAMA_MAX_RNG_STATE (64*1024)
-LLAMA_MAX_RNG_STATE = 64 * 1024
-
 # define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 LLAMA_FILE_MAGIC_GGLA = 0x67676C61

@@ -247,13 +244,13 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa

 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 7
-LLAMA_SESSION_VERSION = 7
+# define LLAMA_SESSION_VERSION 8
+LLAMA_SESSION_VERSION = 8

 # define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
 LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
-# define LLAMA_STATE_SEQ_VERSION 1
-LLAMA_STATE_SEQ_VERSION = 1
+# define LLAMA_STATE_SEQ_VERSION 2
+LLAMA_STATE_SEQ_VERSION = 2

 # struct llama_model;
 llama_model_p = NewType("llama_model_p", int)
@@ -1583,7 +1580,7 @@ def llama_lora_adapter_set(
     ...


-# // Remove a LoRA adapter from given context
+# // Remove a specific LoRA adapter from given context
 # // Return -1 if the adapter is not present in the context
 # LLAMA_API int32_t llama_lora_adapter_remove(
 #         struct llama_context * ctx,
@@ -1601,6 +1598,19 @@ def llama_lora_adapter_remove(
     ...


+# // Remove all LoRA adapters from given context
+# LLAMA_API void llama_lora_adapter_clear(
+#         struct llama_context * ctx);
+@ctypes_function(
+    "llama_lora_adapter_clear",
+    [llama_context_p_ctypes],
+    None,
+)
+def llama_lora_adapter_clear(ctx: llama_context_p, /):
+    """Remove all LoRA adapters from given context"""
+    ...
+
+
 # // Manually free a LoRA adapter
 # // Note: loaded adapters will be free when the associated model is deleted
 # LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
@@ -1992,17 +2002,17 @@ def llama_kv_cache_update(ctx: llama_context_p, /):
 # //


-# Returns the maximum size in bytes of the state (rng, logits, embedding
-# and kv_cache) - will often be smaller after compacting tokens
-# LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+# // Returns the *actual* size in bytes of the state
+# // (rng, logits, embedding and kv_cache)
+# // Only use when saving the state, not when restoring it, otherwise the size may be too small.
+# LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
 @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t)
 def llama_state_get_size(ctx: llama_context_p, /) -> int:
-    """Returns the maximum size in bytes of the state (rng, logits, embedding
-    and kv_cache) - will often be smaller after compacting tokens"""
+    """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens"""
     ...


-# LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+# LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
 #     "use llama_state_get_size instead");
 @ctypes_function("llama_get_state_size", [llama_context_p_ctypes], ctypes.c_size_t)
 def llama_get_state_size(ctx: llama_context_p, /) -> int:
@@ -2011,22 +2021,27 @@ def llama_get_state_size(ctx: llama_context_p, /) -> int:
     ...


-# Copies the state to the specified destination address.
-# Destination needs to have allocated enough memory.
-# Returns the number of bytes copied
+# // Copies the state to the specified destination address.
+# // Destination needs to have allocated enough memory.
+# // Returns the number of bytes copied
 # LLAMA_API size_t llama_state_get_data(
 #         struct llama_context * ctx,
-#         uint8_t * dst);
+#         uint8_t * dst,
+#         size_t size);
 @ctypes_function(
     "llama_state_get_data",
     [
         llama_context_p_ctypes,
         ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
     ],
     ctypes.c_size_t,
 )
 def llama_state_get_data(
-    ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], /
+    ctx: llama_context_p,
+    dst: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    /,
 ) -> int:
     """Copies the state to the specified destination address.
     Destination needs to have allocated enough memory.
@@ -2059,14 +2074,18 @@ def llama_copy_state_data(
 # // Returns the number of bytes read
 # LLAMA_API size_t llama_state_set_data(
 #         struct llama_context * ctx,
-#         const uint8_t * src);
+#         const uint8_t * src,
+#         size_t size);
 @ctypes_function(
     "llama_state_set_data",
-    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
+    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8), ctypes.c_size_t],
     ctypes.c_size_t,
 )
 def llama_state_set_data(
-    ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /
+    ctx: llama_context_p,
+    src: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    /,
 ) -> int:
     """Set the state reading from the specified address
     Returns the number of bytes read"""
@@ -2216,14 +2235,24 @@ def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> i
 # LLAMA_API size_t llama_state_seq_get_data(
 #         struct llama_context * ctx,
 #         uint8_t * dst,
+#         size_t size,
 #         llama_seq_id seq_id);
 @ctypes_function(
     "llama_state_seq_get_data",
-    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8), llama_seq_id],
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
+        llama_seq_id,
+    ],
     ctypes.c_size_t,
 )
 def llama_state_seq_get_data(
-    ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], seq_id: llama_seq_id, /
+    ctx: llama_context_p,
+    dst: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    seq_id: llama_seq_id,
+    /,
 ) -> int:
     """Copy the KV cache of a single sequence into the specified buffer"""
     ...
@@ -2236,14 +2265,24 @@ def llama_state_seq_get_data(
 # LLAMA_API size_t llama_state_seq_set_data(
 #         struct llama_context * ctx,
 #         const uint8_t * src,
+#         size_t size,
 #         llama_seq_id dest_seq_id);
 @ctypes_function(
     "llama_state_seq_set_data",
-    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8), llama_seq_id],
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+        ctypes.c_size_t,
+        llama_seq_id,
+    ],
     ctypes.c_size_t,
 )
 def llama_state_seq_set_data(
-    ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], dest_seq_id: llama_seq_id, /
+    ctx: llama_context_p,
+    src: CtypesArray[ctypes.c_uint8],
+    size: Union[ctypes.c_size_t, int],
+    dest_seq_id: llama_seq_id,
+    /,
 ) -> int:
     """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence"""
     ...
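Note (sketch only): for callers of these low-level bindings, the practical change is that the state snapshot functions now take an explicit buffer size, and a `llama_lora_adapter_clear` binding is available. A rough usage sketch assuming `ctx` is an already-initialized `llama_context_p` (error handling omitted):

# Sketch using the updated bindings shown in this diff; `ctx` must come from an
# existing context created elsewhere via the llama_cpp API.
import ctypes
import llama_cpp

def snapshot_state(ctx) -> bytes:
    # The size is only valid at save time; it may shrink as tokens are compacted.
    size = llama_cpp.llama_state_get_size(ctx)
    buf = (ctypes.c_uint8 * size)()
    written = llama_cpp.llama_state_get_data(ctx, buf, size)  # new explicit size argument
    return bytes(buf[:written])

def restore_state(ctx, blob: bytes) -> int:
    buf = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    return llama_cpp.llama_state_set_data(ctx, buf, len(blob))

def clear_lora(ctx) -> None:
    llama_cpp.llama_lora_adapter_clear(ctx)  # binding added in this commit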

‎vendor/llama.cpp


0 commit comments