Commit 6ce9502

Merge branch 'main' into configurable-chat-templates
2 parents: 179f45c + 3afbf2e

12 files changed (+59, -25 lines)

CHANGELOG.md (14 additions, 0 deletions)

@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- Update llama.cpp to 8781013ef654270cbead3e0011e33a6d690fb168
+- Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
+
+## [0.2.6]
+
+- Update llama.cpp to 80291a1d02a07f7f66666fb576c5b1e75aa48b46
+
+## [0.2.5]
+
+- Fix docker images missing starlette-context dependency by @abetlen in 22917989003c5e67623d54ab45affa1e0e475410
+- Fix loading dll in Windows Isolation Containers by @abetlen in 847466562573191efa655753d9252f308c4fbdb0
+- Fix build issue on m1 macs by @abetlen in dbd3a6d1ed8416a8fd800127251e730153afa305
+- Update docs to gguf and add hw acceleration docs for server by @jasonacox in #688
+
 ## [0.2.4]
 
 - Add NUMA support. **NOTE** low level api users must call llama_backend_init at the start of their programs by abetlen in f4090a0bb2a2a25acfe28d31c82cc1aa273bedee
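
The [0.2.4] NUMA entry above carries a call-order requirement for low-level API users. A minimal sketch of what that looks like in practice (not part of this commit; it mirrors the low-level README example later in this diff):

```python
# Sketch only: low-level API users initialize the backend once,
# before any model or context is created.
import llama_cpp

llama_cpp.llama_backend_init(numa=False)
```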

CMakeLists.txt (12 additions, 5 deletions)

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4...3.22)
+cmake_minimum_required(VERSION 3.21)
 
 project(llama_cpp)
 
@@ -7,15 +7,13 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python
 
 if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
-    if (APPLE)
-        # Need to disable these llama.cpp flags on Apple
+    if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+        # Need to disable these llama.cpp flags on Apple x86_64,
         # otherwise users may encounter invalid instruction errors
         set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
         set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
         set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
         set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mtune=native")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native")
     endif()
     add_subdirectory(vendor/llama.cpp)
     install(
@@ -35,4 +33,13 @@ if (LLAMA_BUILD)
         FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
+    # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
+    install(
+        FILES $<TARGET_RUNTIME_DLLS:llama>
+        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+    )
+    install(
+        FILES $<TARGET_RUNTIME_DLLS:llama>
+        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+    )
 endif()
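
The two new `install()` rules copy the llama runtime DLLs next to the Python package (into the scikit-build platlib staging directory and the source tree) so Windows + CUDA builds can find them at import time. A quick sanity check after installing a wheel on Windows might look like this; a sketch only, not part of the change:

```python
# Sketch only: after installing llama-cpp-python on Windows with CUDA enabled,
# the runtime DLLs should sit in the installed llama_cpp package directory.
import pathlib

import llama_cpp

package_dir = pathlib.Path(llama_cpp.__file__).parent
print(sorted(p.name for p in package_dir.glob("*.dll")))
```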

README.md (14 additions, 7 deletions)

@@ -106,14 +106,14 @@ Below is a short example demonstrating how to use the high-level API to generate
 
 ```python
 >>> from llama_cpp import Llama
->>> llm = Llama(model_path="./models/7B/ggml-model.bin")
+>>> llm = Llama(model_path="./models/7B/llama-model.gguf")
 >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)
 >>> print(output)
 {
   "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
   "object": "text_completion",
   "created": 1679561337,
-  "model": "./models/7B/ggml-model.bin",
+  "model": "./models/7B/llama-model.gguf",
   "choices": [
     {
       "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
@@ -136,15 +136,15 @@ The context window of the Llama models determines the maximum number of tokens t
 For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object:
 
 ```python
-llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
+llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
 ```
 
 ### Loading llama-2 70b
 
 Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
 
 ```python
-llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
+llm = Llama(model_path="./models/70B/llama-model.gguf", n_gqa=8)
 ```
 
 ## Web Server
@@ -156,17 +156,24 @@ To install the server package and get started:
 
 ```bash
 pip install llama-cpp-python[server]
-python3 -m llama_cpp.server --model models/7B/ggml-model.bin
+python3 -m llama_cpp.server --model models/7B/llama-model.gguf
+```
+Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:
+
+```bash
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]
+python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
 ```
 
 Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
 
+
 ## Docker image
 
 A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
 
 ```bash
-docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
+docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest
 ```
 [Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
 
@@ -183,7 +190,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 >>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
->>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
+>>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
 >>> ctx = llama_cpp.llama_new_context_with_model(model, params)
 >>> max_tokens = params.n_ctx
 # use ctypes arrays for array params

docker/cuda_simple/Dockerfile (1 addition, 1 deletion)

@@ -18,7 +18,7 @@ ENV CUDA_DOCKER_ARCH=all
 ENV LLAMA_CUBLAS=1
 
 # Install depencencies
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
 # Install llama-cpp-python (build with cuda)
 RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

docker/open_llama/Dockerfile (1 addition, 1 deletion)

@@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco
     ninja-build \
     build-essential
 
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
 # Perform the conditional installations based on the image
 RUN echo "Image: ${IMAGE}" && \

docker/openblas_simple/Dockerfile (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ COPY . .
 
 # Install the package
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
-RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
 RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
 

llama_cpp/__init__.py (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.4"
+__version__ = "0.2.6"

llama_cpp/llama.py (2 additions, 0 deletions)

@@ -437,6 +437,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         n_tokens = llama_cpp.llama_tokenize_with_model(
             self.model,
             text,
+            len(text),
             tokens,
             n_ctx,
             add_bos,
@@ -447,6 +448,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         n_tokens = llama_cpp.llama_tokenize_with_model(
             self.model,
             text,
+            len(text),
             tokens,
             n_tokens,
             add_bos,
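
Both hunks simply thread the new `len(text)` argument through to `llama_tokenize_with_model`. Through the high-level API the change is transparent; a minimal sketch of the affected code path (the model path is a placeholder):

```python
# Sketch only; replace the model path with a real local GGUF file.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/llama-model.gguf")
# tokenize() now forwards len(text) to llama_tokenize_with_model internally.
token_ids = llm.tokenize(b"Q: Name the planets in the solar system? A: ")
print(token_ids)
```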

llama_cpp/llama_cpp.py (10 additions, 5 deletions)

@@ -24,7 +24,7 @@
 # Load the library
 def _load_shared_library(lib_base_name: str):
     # Construct the paths to the possible shared library names
-    _base_path = pathlib.Path(__file__).parent.resolve()
+    _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
     # Searching for the library in the current directory under the name "libllama" (default name
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths: List[pathlib.Path] = []
@@ -58,7 +58,7 @@ def _load_shared_library(lib_base_name: str):
         if "CUDA_PATH" in os.environ:
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
-        cdll_args["winmode"] = 0
+        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
 
     # Try to load the shared library, handling potential errors
     for _lib_path in _lib_paths:
@@ -950,42 +950,47 @@ def llama_token_nl(ctx: llama_context_p) -> llama_token:
 # LLAMA_API int llama_tokenize(
 #     struct llama_context * ctx,
 #     const char * text,
+#     int text_len,
 #     llama_token * tokens,
 #     int n_max_tokens,
 #     bool add_bos);
 def llama_tokenize(
     ctx: llama_context_p,
     text: bytes,
+    text_len: Union[c_int, int],
     tokens,  # type: Array[llama_token]
     n_max_tokens: Union[c_int, int],
    add_bos: Union[c_bool, int],
 ) -> int:
-    return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
+    return _lib.llama_tokenize(ctx, text, text_len, tokens, n_max_tokens, add_bos)
 
 
-_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]
+_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, c_int, llama_token_p, c_int, c_bool]
 _lib.llama_tokenize.restype = c_int
 
 
 # LLAMA_API int llama_tokenize_with_model(
 #     const struct llama_model * model,
 #     const char * text,
+#     int text_len,
 #     llama_token * tokens,
 #     int n_max_tokens,
 #     bool add_bos);
 def llama_tokenize_with_model(
     model: llama_model_p,
     text: bytes,
+    text_len: Union[c_int, int],
    tokens,  # type: Array[llama_token]
     n_max_tokens: Union[c_int, int],
     add_bos: Union[c_bool, bool],
 ) -> int:
-    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+    return _lib.llama_tokenize_with_model(model, text, text_len, tokens, n_max_tokens, add_bos)
 
 
 _lib.llama_tokenize_with_model.argtypes = [
     llama_model_p,
     c_char_p,
+    c_int,
     llama_token_p,
     c_int,
     c_bool,
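
For direct users of the low-level bindings, the added `text_len` parameter changes the call signature of `llama_tokenize` and `llama_tokenize_with_model`. A sketch of the updated call, patterned on the README's low-level example (paths are placeholders, not part of this commit):

```python
# Sketch only; mirrors the README's low-level example with the new text_len argument.
import llama_cpp

llama_cpp.llama_backend_init(numa=False)
params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

text = b"Q: Name the planets in the solar system? A: "
max_tokens = params.n_ctx
tokens = (llama_cpp.llama_token * int(max_tokens))()  # ctypes array for the output tokens
# text_len (the byte length of `text`) is now the third argument.
n_tokens = llama_cpp.llama_tokenize(ctx, text, len(text), tokens, max_tokens, True)
print(n_tokens)
```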

llama_cpp/server/__main__.py (1 addition, 1 deletion)

@@ -88,7 +88,7 @@ def parse_bool_arg(arg):
                 f"--{name}",
                 dest=name,
                 type=parse_bool_arg,
-                help=f"Disable {description}",
+                help=f"{description}",
             )
 
     args = parser.parse_args()
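
The help-text fix removes a stray "Disable " prefix that the f-string prepended to every auto-generated boolean flag's description. A simplified, illustrative reconstruction of the pattern (the field name and helper below are assumptions, not the actual module code):

```python
# Simplified, illustrative reconstruction -- not the actual server module.
import argparse

def parse_bool_arg(arg: str) -> bool:
    return str(arg).lower() in ("1", "true", "yes", "on")

parser = argparse.ArgumentParser()
name, description = "some_flag", "Whether to enable some feature (illustrative field)"
parser.add_argument(
    f"--{name}",
    dest=name,
    type=parse_bool_arg,
    help=f"{description}",  # previously f"Disable {description}", which misdescribed the flag
)
print(parser.parse_args([f"--{name}", "true"]))
```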

pyproject.toml (1 addition, 2 deletions)

@@ -54,9 +54,8 @@ all = [
 [tool.scikit-build]
 wheel.packages = ["llama_cpp"]
 cmake.verbose = true
-cmake.minimum-version = "3.12"
+cmake.minimum-version = "3.21"
 minimum-version = "0.5"
-ninja.make-fallback = false
 sdist.exclude = [".git", "vendor/llama.cpp/.git"]
 
 [tool.scikit-build.metadata.version]

vendor/llama.cpp (submodule updated)


0 commit comments