Commit 2292af5

feat: Update llama.cpp

1 parent: 221edb9
4 files changed: 41 additions, 37 deletions

llama_cpp/llama.py

6 additions, 6 deletions
@@ -65,7 +65,7 @@ def __init__(
         *,
         # Model Params
         n_gpu_layers: int = 0,
-        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
@@ -78,7 +78,7 @@ def __init__(
         n_batch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
-        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
+        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
         yarn_ext_factor: float = -1.0,
@@ -238,13 +238,13 @@ def __init__(
             for i, (k, v) in enumerate(kv_overrides.items()):
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
                     self._kv_overrides_array[i].value.bool_value = v
                 elif isinstance(v, int):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
@@ -270,7 +270,7 @@ def __init__(
         self.context_params.rope_scaling_type = (
             rope_scaling_type
            if rope_scaling_type is not None
-            else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+            else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
        )
        self.context_params.rope_freq_base = (
            rope_freq_base if rope_freq_base != 0.0 else 0
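Callers that referenced the old constant names need the suffixed _MODE_/_TYPE_ variants after this update. A minimal sketch of an updated caller; the model path and the override key below are illustrative placeholders, not part of this commit:

import llama_cpp
from llama_cpp import Llama

# Hypothetical path and metadata key, shown only to illustrate the renamed constants.
llm = Llama(
    model_path="./models/model.Q4_K_M.gguf",
    n_gpu_layers=-1,  # -1 moves all layers to the GPU
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,                      # was LLAMA_SPLIT_LAYER
    rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,  # was LLAMA_ROPE_SCALING_UNSPECIFIED
    kv_overrides={"tokenizer.ggml.add_bos_token": True},              # tagged internally as LLAMA_KV_OVERRIDE_TYPE_BOOL
)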

llama_cpp/llama_cpp.py

28 additions, 28 deletions
@@ -279,35 +279,35 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
-#     LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-#     LLAMA_ROPE_SCALING_NONE = 0,
-#     LLAMA_ROPE_SCALING_LINEAR = 1,
-#     LLAMA_ROPE_SCALING_YARN = 2,
-#     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+#     LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+#     LLAMA_ROPE_SCALING_TYPE_NONE = 0,
+#     LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
+#     LLAMA_ROPE_SCALING_TYPE_YARN = 2,
+#     LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
 # };
-LLAMA_ROPE_SCALING_UNSPECIFIED = -1
-LLAMA_ROPE_SCALING_NONE = 0
-LLAMA_ROPE_SCALING_LINEAR = 1
-LLAMA_ROPE_SCALING_YARN = 2
-LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
+LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
+LLAMA_ROPE_SCALING_TYPE_NONE = 0
+LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
+LLAMA_ROPE_SCALING_TYPE_YARN = 2
+LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 
 # enum llama_pooling_type {
-#     LLAMA_POOLING_NONE = 0,
-#     LLAMA_POOLING_MEAN = 1,
-#     LLAMA_POOLING_CLS = 2,
+#     LLAMA_POOLING_TYPE_NONE = 0,
+#     LLAMA_POOLING_TYPE_MEAN = 1,
+#     LLAMA_POOLING_TYPE_CLS = 2,
 # };
-LLAMA_POOLING_NONE = 0
-LLAMA_POOLING_MEAN = 1
-LLAMA_POOLING_CLS = 2
+LLAMA_POOLING_TYPE_NONE = 0
+LLAMA_POOLING_TYPE_MEAN = 1
+LLAMA_POOLING_TYPE_CLS = 2
 
 # enum llama_split_mode {
-#     LLAMA_SPLIT_NONE = 0, // single GPU
-#     LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-#     LLAMA_SPLIT_ROW = 2, // split rows across GPUs
+#     LLAMA_SPLIT_MODE_NONE = 0, // single GPU
+#     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
 # };
-LLAMA_SPLIT_NONE = 0
-LLAMA_SPLIT_LAYER = 1
-LLAMA_SPLIT_ROW = 2
+LLAMA_SPLIT_MODE_NONE = 0
+LLAMA_SPLIT_MODE_LAYER = 1
+LLAMA_SPLIT_MODE_ROW = 2
 
 
 # typedef struct llama_token_data {
@@ -420,13 +420,13 @@ class llama_batch(ctypes.Structure):
 
 
 # enum llama_model_kv_override_type {
-#     LLAMA_KV_OVERRIDE_INT,
-#     LLAMA_KV_OVERRIDE_FLOAT,
-#     LLAMA_KV_OVERRIDE_BOOL,
+#     LLAMA_KV_OVERRIDE_TYPE_INT,
+#     LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+#     LLAMA_KV_OVERRIDE_TYPE_BOOL,
 # };
-LLAMA_KV_OVERRIDE_INT = 0
-LLAMA_KV_OVERRIDE_FLOAT = 1
-LLAMA_KV_OVERRIDE_BOOL = 2
+LLAMA_KV_OVERRIDE_TYPE_INT = 0
+LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
+LLAMA_KV_OVERRIDE_TYPE_BOOL = 2
 
 
 # struct llama_model_kv_override {
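Only the Python-level names change; the integer values mirrored from the upstream llama.cpp enums stay the same, so the renamed constants still compare equal to the old codes. A quick sanity check against the values shown in the diff above:

import llama_cpp

# Values taken from the enum blocks above; only the names gained a _TYPE_/_MODE_ infix.
assert llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED == -1
assert llama_cpp.LLAMA_POOLING_TYPE_CLS == 2
assert llama_cpp.LLAMA_SPLIT_MODE_LAYER == 1
assert llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL == 2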

llama_cpp/server/settings.py

6 additions, 2 deletions
@@ -29,7 +29,7 @@ class ModelSettings(BaseSettings):
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
     split_mode: int = Field(
-        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         description="The split mode to use.",
     )
     main_gpu: int = Field(
@@ -74,7 +74,7 @@ class ModelSettings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
+    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
@@ -143,6 +143,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace repo_id to use to load model files from",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
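Server deployments that set split_mode or rope_scaling_type through ModelSettings pick up the renamed defaults automatically, and can now point at a Hugging Face repo via the new hf_model_repo_id field. A minimal sketch, assuming ModelSettings also exposes the usual model path field defined elsewhere in settings.py (not shown in this diff); the repo id and filename are placeholders:

import llama_cpp
from llama_cpp.server.settings import ModelSettings

# Hypothetical configuration, shown only to illustrate the new and renamed fields.
settings = ModelSettings(
    model="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    hf_model_repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",       # new field in this commit
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,                     # renamed default
    rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, # renamed default
)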

vendor/llama.cpp

1 addition, 1 deletion (submodule pointer updated to the new upstream llama.cpp commit)
