Commit fa83cc5

Update llama.cpp
Fix build examples
Exclude examples directory
Revert cmake changes
Try actions/checkout@v4
Try to update submodules
Revert
1 parent: ddbd10c

File tree: 5 files changed, +145 -39 lines

.github/workflows/test.yaml
llama_cpp/llama.py
llama_cpp/llama_cpp.py
llama_cpp/server/app.py
vendor/llama.cpp

‎.github/workflows/test.yaml

1 addition, 1 deletion
@@ -17,7 +17,7 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
      with:
        submodules: "true"
    - name: Set up Python ${{ matrix.python-version }}

‎llama_cpp/llama.py

47 additions, 11 deletions
@@ -230,8 +230,14 @@ def __init__(
         n_batch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
+        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
+        yarn_ext_factor: float = float("nan"),
+        yarn_attn_factor: float = 1.0,
+        yarn_beta_fast: float = 32.0,
+        yarn_beta_slow: float = 1.0,
+        yarn_orig_ctx: int = 0,
         mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -255,30 +261,30 @@ def __init__(
 
         Args:
             model_path: Path to the model.
-            seed: Random seed. -1 for random.
-            n_ctx: Maximum context size.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-            main_gpu: Main GPU to use.
-            tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
+            main_gpu: The GPU that is used for scratch and small tensors.
+            tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            vocab_only: Only load the vocabulary no weights.
+            use_mmap: Use mmap if possible.
+            use_mlock: Force the system to keep the model in RAM.
+            seed: Random seed. -1 for random.
+            n_ctx: Context size.
+            n_batch: Batch size for prompt processing (must be >= 32 to use BLAS)
+            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
+            n_threads_batch: Number of threads to use for batch processing. If None, use n_threads.
+            rope_scaling_type: Type of rope scaling to use.
             rope_freq_base: Base frequency for rope sampling.
             rope_freq_scale: Scale factor for rope sampling.
-            low_vram: Use low VRAM mode.
             mul_mat_q: if true, use experimental mul_mat_q kernels
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
-            vocab_only: Only load the vocabulary no weights.
-            use_mmap: Use mmap if possible.
-            use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
-            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
             numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             verbose: Print verbose output to stderr.
-            kwargs: Unused keyword arguments (for additional backwards compatibility).
 
         Raises:
             ValueError: If the model path does not exist.
@@ -332,12 +338,30 @@ def __init__(
         self.context_params.n_batch = self.n_batch
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_scaling_type = (
+            rope_scaling_type if rope_scaling_type is not None else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+        )
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
         )
         self.context_params.rope_freq_scale = (
             rope_freq_scale if rope_freq_scale != 0.0 else 0
         )
+        self.context_params.yarn_ext_factor = (
+            yarn_ext_factor if yarn_ext_factor != 0.0 else 0
+        )
+        self.context_params.yarn_attn_factor = (
+            yarn_attn_factor if yarn_attn_factor != 0.0 else 0
+        )
+        self.context_params.yarn_beta_fast = (
+            yarn_beta_fast if yarn_beta_fast != 0.0 else 0
+        )
+        self.context_params.yarn_beta_slow = (
+            yarn_beta_slow if yarn_beta_slow != 0.0 else 0
+        )
+        self.context_params.yarn_orig_ctx = (
+            yarn_orig_ctx if yarn_orig_ctx != 0 else 0
+        )
         self.context_params.mul_mat_q = mul_mat_q
         self.context_params.f16_kv = f16_kv
         self.context_params.logits_all = logits_all
@@ -1671,8 +1695,14 @@ def __getstate__(self):
             n_batch=self.n_batch,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
+            rope_scaling_type=self.context_params.rope_scaling_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
+            yarn_ext_factor=self.context_params.yarn_ext_factor,
+            yarn_attn_factor=self.context_params.yarn_attn_factor,
+            yarn_beta_fast=self.context_params.yarn_beta_fast,
+            yarn_beta_slow=self.context_params.yarn_beta_slow,
+            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             mul_mat_q=self.context_params.mul_mat_q,
             f16_kv=self.context_params.f16_kv,
             logits_all=self.context_params.logits_all,
@@ -1709,6 +1739,12 @@ def __setstate__(self, state):
             n_threads_batch=state["n_threads_batch"],
             rope_freq_base=state["rope_freq_base"],
             rope_freq_scale=state["rope_freq_scale"],
+            rope_scaling_type=state["rope_scaling_type"],
+            yarn_ext_factor=state["yarn_ext_factor"],
+            yarn_attn_factor=state["yarn_attn_factor"],
+            yarn_beta_fast=state["yarn_beta_fast"],
+            yarn_beta_slow=state["yarn_beta_slow"],
+            yarn_orig_ctx=state["yarn_orig_ctx"],
             mul_mat_q=state["mul_mat_q"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
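
The net effect of these changes is that llama.cpp's RoPE-scaling and YaRN parameters are now exposed on the high-level Llama constructor and round-trip through pickling. A minimal usage sketch, with a placeholder model path and illustrative context values (not taken from this commit):

from llama_cpp import Llama
from llama_cpp.llama_cpp import LLAMA_ROPE_SCALING_YARN

# Placeholder model path and context sizes, for illustration only.
llm = Llama(
    model_path="./models/model.gguf",
    n_ctx=8192,                                # extended context window
    rope_scaling_type=LLAMA_ROPE_SCALING_YARN,
    yarn_orig_ctx=4096,                        # original training context (illustrative)
    # yarn_ext_factor, yarn_attn_factor, yarn_beta_fast and yarn_beta_slow keep
    # their new defaults (NaN, 1.0, 32.0, 1.0) unless overridden.
)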

‎llama_cpp/llama_cpp.py

26 additions, 2 deletions
@@ -192,6 +192,18 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_FTYPE_MOSTLY_Q6_K = 18
 LLAMA_FTYPE_GUESSED = 1024
 
+# enum llama_rope_scaling_type {
+#     LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+#     LLAMA_ROPE_SCALING_NONE = 0,
+#     LLAMA_ROPE_SCALING_LINEAR = 1,
+#     LLAMA_ROPE_SCALING_YARN = 2,
+#     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+# };
+LLAMA_ROPE_SCALING_UNSPECIFIED = -1
+LLAMA_ROPE_SCALING_NONE = 0
+LLAMA_ROPE_SCALING_LINEAR = 1
+LLAMA_ROPE_SCALING_YARN = 2
+LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 
 # typedef struct llama_token_data {
 #     llama_token id; // token id
@@ -308,10 +320,16 @@ class llama_model_params(Structure):
 #     uint32_t n_batch;         // prompt processing maximum batch size
 #     uint32_t n_threads;       // number of threads to use for generation
 #     uint32_t n_threads_batch; // number of threads to use for batch processing
+#     int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-#     float rope_freq_base;  // RoPE base frequency, 0 = from model
-#     float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+#     float    rope_freq_base;   // RoPE base frequency, 0 = from model
+#     float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+#     float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+#     float    yarn_attn_factor; // YaRN magnitude scaling factor
+#     float    yarn_beta_fast;   // YaRN low correction dim
+#     float    yarn_beta_slow;   // YaRN high correction dim
+#     uint32_t yarn_orig_ctx;    // YaRN original context size
 
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
@@ -327,8 +345,14 @@ class llama_context_params(Structure):
         ("n_batch", c_uint32),
         ("n_threads", c_uint32),
         ("n_threads_batch", c_uint32),
+        ("rope_scaling_type", c_int8),
         ("rope_freq_base", c_float),
         ("rope_freq_scale", c_float),
+        ("yarn_ext_factor", c_float),
+        ("yarn_attn_factor", c_float),
+        ("yarn_beta_fast", c_float),
+        ("yarn_beta_slow", c_float),
+        ("yarn_orig_ctx", c_uint32),
         ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),

‎llama_cpp/server/app.py

70 additions, 24 deletions
@@ -41,11 +41,7 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
-    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
-    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
-    )
+    # Model Params
     n_gpu_layers: int = Field(
         default=0,
         ge=-1,
@@ -60,17 +56,6 @@ class Settings(BaseSettings):
         default=None,
         description="Split layers across multiple GPUs in proportion.",
     )
-    rope_freq_base: float = Field(
-        default=0.0, description="RoPE base frequency"
-    )
-    rope_freq_scale: float = Field(
-        default=0.0, description="RoPE frequency scaling factor"
-    )
-    mul_mat_q: bool = Field(
-        default=True, description="if true, use experimental mul_mat_q kernels"
-    )
-    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
     vocab_only: bool = Field(
         default=False, description="Whether to only return the vocabulary."
     )
@@ -82,17 +67,59 @@ class Settings(BaseSettings):
         default=llama_cpp.llama_mlock_supported(),
         description="Use mlock.",
     )
-    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    # Context Params
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
+    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_threads: int = Field(
         default=max(multiprocessing.cpu_count() // 2, 1),
         ge=1,
         description="The number of threads to use.",
     )
+    n_threads_batch: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=0,
+        description="The number of threads to use when batch processing.",
+    )
+    rope_scaling_type: int = Field(
+        default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+    )
+    rope_freq_base: float = Field(
+        default=0.0, description="RoPE base frequency"
+    )
+    rope_freq_scale: float = Field(
+        default=0.0, description="RoPE frequency scaling factor"
+    )
+    yarn_ext_factor: float = Field(
+        default=float("nan")
+    )
+    yarn_attn_factor: float = Field(
+        default=1.0
+    )
+    yarn_beta_fast: float = Field(
+        default=32.0
+    )
+    yarn_beta_slow: float = Field(
+        default=1.0
+    )
+    yarn_orig_ctx: int = Field(
+        default=0
+    )
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
+    )
+    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
+    # LoRA Params
     lora_base: Optional[str] = Field(
         default=None,
         description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
@@ -101,14 +128,17 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    # Backend Params
     numa: bool = Field(
         default=False,
         description="Enable NUMA support.",
     )
+    # Chat Format Params
     chat_format: str = Field(
         default="llama-2",
         description="Chat format to use.",
     )
+    # Cache Params
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -121,9 +151,11 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
+    # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
+    # Server Params
     host: str = Field(default="localhost", description="Listen address")
     port: int = Field(default=8000, description="Listen port")
     interrupt_requests: bool = Field(
@@ -345,27 +377,41 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
-        seed=settings.seed,
-        n_ctx=settings.n_ctx,
-        n_batch=settings.n_batch,
+        # Model Params
         n_gpu_layers=settings.n_gpu_layers,
         main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
+        vocab_only=settings.vocab_only,
+        use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
+        # Context Params
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
+        n_threads=settings.n_threads,
+        n_threads_batch=settings.n_threads_batch,
+        rope_scaling_type=settings.rope_scaling_type,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
+        yarn_ext_factor=settings.yarn_ext_factor,
+        yarn_attn_factor=settings.yarn_attn_factor,
+        yarn_beta_fast=settings.yarn_beta_fast,
+        yarn_beta_slow=settings.yarn_beta_slow,
+        yarn_orig_ctx=settings.yarn_orig_ctx,
         mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
        logits_all=settings.logits_all,
-        vocab_only=settings.vocab_only,
-        use_mmap=settings.use_mmap,
-        use_mlock=settings.use_mlock,
         embedding=settings.embedding,
-        n_threads=settings.n_threads,
+        # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
+        # LoRA Params
         lora_base=settings.lora_base,
         lora_path=settings.lora_path,
+        # Backend Params
         numa=settings.numa,
+        # Chat Format Params
         chat_format=settings.chat_format,
+        # Misc
         verbose=settings.verbose,
     )
     if settings.cache:
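
Because Settings is a pydantic BaseSettings, the new fields can be supplied programmatically or through environment variables with the same names. A sketch of wiring them through create_app and serving the app with uvicorn, using a placeholder model path and illustrative YaRN values:

import uvicorn

from llama_cpp.llama_cpp import LLAMA_ROPE_SCALING_YARN
from llama_cpp.server.app import Settings, create_app

# Placeholder model path; the remaining values mirror the new Settings fields above.
settings = Settings(
    model="./models/model.gguf",
    n_ctx=8192,
    rope_scaling_type=LLAMA_ROPE_SCALING_YARN,
    yarn_orig_ctx=4096,
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app, host=settings.host, port=settings.port)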

‎vendor/llama.cpp

Submodule pointer updated (+1, -1).
