Commit 6ce9502

Merge branch 'main' into configurable-chat-templates
2 parents: 179f45c + 3afbf2e

12 files changed (+59, -25 lines)

CHANGELOG.md (14 additions, 0 deletions)

@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- Update llama.cpp to 8781013ef654270cbead3e0011e33a6d690fb168
+- Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
+
+## [0.2.6]
+
+- Update llama.cpp to 80291a1d02a07f7f66666fb576c5b1e75aa48b46
+
+## [0.2.5]
+
+- Fix docker images missing starlette-context dependency by @abetlen in 22917989003c5e67623d54ab45affa1e0e475410
+- Fix loading dll in Windows Isolation Containers by @abetlen in 847466562573191efa655753d9252f308c4fbdb0
+- Fix build issue on m1 macs by @abetlen in dbd3a6d1ed8416a8fd800127251e730153afa305
+- Update docs to gguf and add hw acceleration docs for server by @jasonacox in #688
+
 ## [0.2.4]
 
 - Add NUMA support. **NOTE** low level api users must call llama_backend_init at the start of their programs by abetlen in f4090a0bb2a2a25acfe28d31c82cc1aa273bedee
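
The [0.2.4] NUMA entry above carries a call-order requirement for low-level API users. A minimal sketch of what that looks like in practice (not part of this commit; it mirrors the low-level README example later in this diff):

```python
# Sketch only: low-level API users initialize the backend once,
# before any model or context is created.
import llama_cpp

llama_cpp.llama_backend_init(numa=False)
```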

CMakeLists.txt (12 additions, 5 deletions)

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4...3.22)
+cmake_minimum_required(VERSION 3.21)
 
 project(llama_cpp)
 
@@ -7,15 +7,13 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python
 
 if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
-    if (APPLE)
-        # Need to disable these llama.cpp flags on Apple
+    if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+        # Need to disable these llama.cpp flags on Apple x86_64,
         # otherwise users may encounter invalid instruction errors
         set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
         set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
         set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
         set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mtune=native")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native")
     endif()
     add_subdirectory(vendor/llama.cpp)
     install(
@@ -35,4 +33,13 @@ if (LLAMA_BUILD)
         FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
+    # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
+    install(
+        FILES $<TARGET_RUNTIME_DLLS:llama>
+        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+    )
+    install(
+        FILES $<TARGET_RUNTIME_DLLS:llama>
+        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+    )
 endif()
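
The two new `install()` rules copy the llama runtime DLLs next to the Python package (into the scikit-build platlib staging directory and the source tree) so Windows + CUDA builds can find them at import time. A quick sanity check after installing a wheel on Windows might look like this; a sketch only, not part of the change:

```python
# Sketch only: after installing llama-cpp-python on Windows with CUDA enabled,
# the runtime DLLs should sit in the installed llama_cpp package directory.
import pathlib

import llama_cpp

package_dir = pathlib.Path(llama_cpp.__file__).parent
print(sorted(p.name for p in package_dir.glob("*.dll")))
```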

README.md (14 additions, 7 deletions)

@@ -106,14 +106,14 @@ Below is a short example demonstrating how to use the high-level API to generate
 
 ```python
 >>> from llama_cpp import Llama
->>> llm = Llama(model_path="./models/7B/ggml-model.bin")
+>>> llm = Llama(model_path="./models/7B/llama-model.gguf")
 >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)
 >>> print(output)
 {
   "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
   "object": "text_completion",
   "created": 1679561337,
-  "model": "./models/7B/ggml-model.bin",
+  "model": "./models/7B/llama-model.gguf",
   "choices": [
     {
       "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
@@ -136,15 +136,15 @@ The context window of the Llama models determines the maximum number of tokens t
 For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object:
 
 ```python
-llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
+llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
 ```
 
 ### Loading llama-2 70b
 
 Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
 
 ```python
-llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
+llm = Llama(model_path="./models/70B/llama-model.gguf", n_gqa=8)
 ```
 
 ## Web Server
@@ -156,17 +156,24 @@ To install the server package and get started:
 
 ```bash
 pip install llama-cpp-python[server]
-python3 -m llama_cpp.server --model models/7B/ggml-model.bin
+python3 -m llama_cpp.server --model models/7B/llama-model.gguf
+```
+Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:
+
+```bash
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]
+python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
 ```
 
 Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
 
+
 ## Docker image
 
 A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
 
 ```bash
-docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
+docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest
 ```
 [Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
 
@@ -183,7 +190,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 >>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
->>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
+>>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
 >>> ctx = llama_cpp.llama_new_context_with_model(model, params)
 >>> max_tokens = params.n_ctx
 # use ctypes arrays for array params

docker/cuda_simple/Dockerfile (1 addition, 1 deletion)

@@ -18,7 +18,7 @@ ENV CUDA_DOCKER_ARCH=all
 ENV LLAMA_CUBLAS=1
 
 # Install depencencies
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
 # Install llama-cpp-python (build with cuda)
 RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

docker/open_llama/Dockerfile (1 addition, 1 deletion)

@@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco
     ninja-build \
     build-essential
 
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
 # Perform the conditional installations based on the image
 RUN echo "Image: ${IMAGE}" && \

docker/openblas_simple/Dockerfile (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ COPY . .
 
 # Install the package
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
-RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
 RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
 

llama_cpp/__init__.py (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.4"
+__version__ = "0.2.6"

llama_cpp/llama.py (2 additions, 0 deletions)

@@ -437,6 +437,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         n_tokens = llama_cpp.llama_tokenize_with_model(
             self.model,
             text,
+            len(text),
             tokens,
             n_ctx,
             add_bos,
@@ -447,6 +448,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         n_tokens = llama_cpp.llama_tokenize_with_model(
             self.model,
             text,
+            len(text),
             tokens,
             n_tokens,
             add_bos,
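
Both hunks simply thread the new `len(text)` argument through to `llama_tokenize_with_model`. Through the high-level API the change is transparent; a minimal sketch of the affected code path (the model path is a placeholder):

```python
# Sketch only; replace the model path with a real local GGUF file.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/llama-model.gguf")
# tokenize() now forwards len(text) to llama_tokenize_with_model internally.
token_ids = llm.tokenize(b"Q: Name the planets in the solar system? A: ")
print(token_ids)
```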

llama_cpp/llama_cpp.py (10 additions, 5 deletions)

@@ -24,7 +24,7 @@
 # Load the library
 def _load_shared_library(lib_base_name: str):
     # Construct the paths to the possible shared library names
-    _base_path = pathlib.Path(__file__).parent.resolve()
+    _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
     # Searching for the library in the current directory under the name "libllama" (default name
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths: List[pathlib.Path] = []
@@ -58,7 +58,7 @@ def _load_shared_library(lib_base_name: str):
         if "CUDA_PATH" in os.environ:
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
-        cdll_args["winmode"] = 0
+        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
 
     # Try to load the shared library, handling potential errors
     for _lib_path in _lib_paths:
@@ -950,42 +950,47 @@ def llama_token_nl(ctx: llama_context_p) -> llama_token:
 # LLAMA_API int llama_tokenize(
 #     struct llama_context * ctx,
 #     const char * text,
+#     int text_len,
 #     llama_token * tokens,
 #     int n_max_tokens,
 #     bool add_bos);
 def llama_tokenize(
     ctx: llama_context_p,
     text: bytes,
+    text_len: Union[c_int, int],
     tokens,  # type: Array[llama_token]
     n_max_tokens: Union[c_int, int],
    add_bos: Union[c_bool, int],
 ) -> int:
-    return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
+    return _lib.llama_tokenize(ctx, text, text_len, tokens, n_max_tokens, add_bos)
 
 
-_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]
+_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, c_int, llama_token_p, c_int, c_bool]
 _lib.llama_tokenize.restype = c_int
 
 
 # LLAMA_API int llama_tokenize_with_model(
 #     const struct llama_model * model,
 #     const char * text,
+#     int text_len,
 #     llama_token * tokens,
 #     int n_max_tokens,
 #     bool add_bos);
 def llama_tokenize_with_model(
     model: llama_model_p,
     text: bytes,
+    text_len: Union[c_int, int],
    tokens,  # type: Array[llama_token]
     n_max_tokens: Union[c_int, int],
     add_bos: Union[c_bool, bool],
 ) -> int:
-    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+    return _lib.llama_tokenize_with_model(model, text, text_len, tokens, n_max_tokens, add_bos)
 
 
 _lib.llama_tokenize_with_model.argtypes = [
     llama_model_p,
     c_char_p,
+    c_int,
     llama_token_p,
     c_int,
     c_bool,
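
For direct users of the low-level bindings, the added `text_len` parameter changes the call signature of `llama_tokenize` and `llama_tokenize_with_model`. A sketch of the updated call, patterned on the README's low-level example (paths are placeholders, not part of this commit):

```python
# Sketch only; mirrors the README's low-level example with the new text_len argument.
import llama_cpp

llama_cpp.llama_backend_init(numa=False)
params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

text = b"Q: Name the planets in the solar system? A: "
max_tokens = params.n_ctx
tokens = (llama_cpp.llama_token * int(max_tokens))()  # ctypes array for the output tokens
# text_len (the byte length of `text`) is now the third argument.
n_tokens = llama_cpp.llama_tokenize(ctx, text, len(text), tokens, max_tokens, True)
print(n_tokens)
```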

llama_cpp/server/__main__.py (1 addition, 1 deletion)

@@ -88,7 +88,7 @@ def parse_bool_arg(arg):
                 f"--{name}",
                 dest=name,
                 type=parse_bool_arg,
-                help=f"Disable {description}",
+                help=f"{description}",
             )
 
     args = parser.parse_args()
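
The help-text fix removes a stray "Disable " prefix that the f-string prepended to every auto-generated boolean flag's description. A simplified, illustrative reconstruction of the pattern (the field name and helper below are assumptions, not the actual module code):

```python
# Simplified, illustrative reconstruction -- not the actual server module.
import argparse

def parse_bool_arg(arg: str) -> bool:
    return str(arg).lower() in ("1", "true", "yes", "on")

parser = argparse.ArgumentParser()
name, description = "some_flag", "Whether to enable some feature (illustrative field)"
parser.add_argument(
    f"--{name}",
    dest=name,
    type=parse_bool_arg,
    help=f"{description}",  # previously f"Disable {description}", which misdescribed the flag
)
print(parser.parse_args([f"--{name}", "true"]))
```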

pyproject.toml (1 addition, 2 deletions)

@@ -54,9 +54,8 @@ all = [
 [tool.scikit-build]
 wheel.packages = ["llama_cpp"]
 cmake.verbose = true
-cmake.minimum-version = "3.12"
+cmake.minimum-version = "3.21"
 minimum-version = "0.5"
-ninja.make-fallback = false
 sdist.exclude = [".git", "vendor/llama.cpp/.git"]
 
 [tool.scikit-build.metadata.version]

vendor/llama.cpp (submodule updated)


0 commit comments