Commit cdf5976: Update llama.cpp
Parent: 7a536e8

4 files changed: 16 additions, 5 deletions

llama_cpp/llama.py (4 additions, 0 deletions)
```diff
@@ -83,6 +83,7 @@ def __init__(
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -129,6 +130,7 @@ def __init__(
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -1081,6 +1083,7 @@ def __getstate__(self):
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1100,6 +1103,7 @@ def __setstate__(self, state):
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
```
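Taken together, these llama.py changes thread the new keyword through the high-level API and its pickle support. A minimal usage sketch follows; the model path and layer count are illustrative placeholders, not part of this commit:

```python
from llama_cpp import Llama

# Hypothetical model path; n_gpu_layers is the number of layers kept in
# VRAM (the default of 0 keeps the whole model on the CPU).
llm = Llama(model_path="./models/7B/ggml-model.bin", n_gpu_layers=32)
out = llm("Q: What is the capital of France? A:", max_tokens=16)
print(out["choices"][0]["text"])
```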

llama_cpp/llama_cpp.py (5 additions, 4 deletions)
```diff
@@ -68,7 +68,7 @@ def _load_shared_library(lib_base_name: str):
 _lib = _load_shared_library(_lib_base_name)
 
 # C types
-LLAMA_FILE_VERSION = c_int(1)
+LLAMA_FILE_VERSION = c_int(2)
 LLAMA_FILE_MAGIC = b"ggjt"
 LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
 LLAMA_SESSION_MAGIC = b"ggsn"
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
     _fields_ = [
         ("n_ctx", c_int),  # text context
         ("n_parts", c_int),  # -1 for default
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
         ("seed", c_int),  # RNG seed, 0 for random
         ("f16_kv", c_bool),  # use fp16 for KV cache
         (
@@ -135,7 +136,7 @@ class llama_context_params(Structure):
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
 # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
@@ -259,9 +260,9 @@ def llama_get_state_size(ctx: llama_context_p) -> c_size_t:
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)
 
 
 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
```
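Since llama_context_params is a ctypes Structure mirroring the C struct in vendor/llama.cpp, fields must be declared in the same order as upstream, which is why n_gpu_layers lands between n_parts and seed here. At the low level the new field is set like any other member; a short sketch (the layer count is an arbitrary example):

```python
import llama_cpp

# Fetch the library defaults, then override the new field.
params = llama_cpp.llama_context_default_params()
params.n_gpu_layers = 20  # arbitrary example: keep 20 layers in VRAM
print(params.n_ctx, params.n_parts, params.n_gpu_layers)
```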

llama_cpp/server/app.py (6 additions, 0 deletions)
```diff
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
         description="The path to the model to use for generating completions."
     )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
```
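Because Settings is a pydantic BaseSettings model, the new option can be supplied programmatically or, by pydantic's usual convention, through an environment variable such as N_GPU_LAYERS. A hedged sketch with a hypothetical model path:

```python
from llama_cpp.server.app import create_app, Settings

# Hypothetical model path; n_gpu_layers could equally come from the
# N_GPU_LAYERS environment variable via BaseSettings.
settings = Settings(model="./models/7B/ggml-model.bin", n_gpu_layers=32)
app = create_app(settings)  # run with an ASGI server such as uvicorn
```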

vendor/llama.cpp (1 addition, 1 deletion: the submodule commit pointer bump that gives the commit its title)
