Commit 6a20293

Reorder init params to match llama.cpp order

1 parent: c8f9b8a

1 file changed: 23 additions, 28 deletions

llama_cpp/llama.py (+23 −28)

@@ -214,54 +214,55 @@ def __init__(
         model_path: str,
         *,
         # NOTE: These parameters are likely to change in the future.
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
-        n_parts: int = -1,
+        n_batch: int = 512,
         n_gpu_layers: int = 0,
-        seed: int = 1337,
+        main_gpu: int = 0,
+        tensor_split: Optional[List[float]] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        low_vram: bool = False,
+        mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
         embedding: bool = False,
         n_threads: Optional[int] = None,
-        n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
-        low_vram: bool = False,
-        tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
-        mul_mat_q: Optional[bool] = None,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.

         Args:
             model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
             seed: Random seed. -1 for random.
+            n_ctx: Maximum context size.
+            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+            main_gpu: Main GPU to use.
+            tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
+            rope_freq_base: Base frequency for rope sampling.
+            rope_freq_scale: Scale factor for rope sampling.
+            low_vram: Use low VRAM mode.
+            mul_mat_q: if true, use experimental mul_mat_q kernels
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
             verbose: Print verbose output to stderr.
+            kwargs: Unused keyword arguments (for additional backwards compatibility).

         Raises:
             ValueError: If the model path does not exist.
@@ -274,16 +275,20 @@ def __init__(
         self.model_path = model_path

         self.params = llama_cpp.llama_context_default_params()
+        self.params.seed = seed
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.seed = seed
+        self.params.main_gpu = main_gpu
+        self.params.rope_freq_base = rope_freq_base
+        self.params.rope_freq_scale = rope_freq_scale
+        self.params.low_vram = low_vram
+        self.params.mul_mat_q = mul_mat_q
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
-        self.params.low_vram = low_vram

         self.tensor_split = tensor_split
         self._p_tensor_split = None
@@ -296,12 +301,6 @@ def __init__(
         )  # keep a reference to the array so it is not gc'd
         self.params.tensor_split = self._c_tensor_split

-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-
-
-        if mul_mat_q is not None:
-            self.params.mul_mat_q = mul_mat_q

         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -313,10 +312,6 @@ def __init__(
         self.lora_base = lora_base
         self.lora_path = lora_path

-        ### DEPRECATED ###
-        self.n_parts = n_parts
-        ### DEPRECATED ###
-
         if not os.path.exists(model_path):
            raise ValueError(f"Model path does not exist: {model_path}")

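For illustration, here is a minimal usage sketch (not part of the commit) showing construction of Llama with the reordered parameters; the model path is hypothetical. Because every parameter after model_path is keyword-only (note the bare * in the signature), the reorder cannot break existing call sites, and removed parameters such as n_parts, n_gqa, and rms_norm_eps are still accepted but ignored via **kwargs.

    import llama_cpp
    from llama_cpp import Llama

    # Minimal sketch, assuming a hypothetical model file at ./models/7B/model.bin.
    # All parameters after model_path are keyword-only, so only their names matter.
    llm = Llama(
        model_path="./models/7B/model.bin",
        seed=llama_cpp.LLAMA_DEFAULT_SEED,  # replaces the old hard-coded 1337 default
        n_ctx=512,
        n_batch=512,
        n_gpu_layers=0,
        main_gpu=0,
        rope_freq_base=10000.0,
        rope_freq_scale=1.0,
        mul_mat_q=True,   # now a plain bool defaulting to True, set unconditionally
        n_parts=-1,       # removed parameter; silently absorbed by **kwargs
    )

Passing the other removed keywords (n_gqa, rms_norm_eps) is likewise absorbed by **kwargs rather than raising a TypeError, which is what the new "kwargs: Unused keyword arguments" docstring entry refers to.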
0 commit comments