Commit 2920c4b

Update server params. Added lora_base, lora_path, low_vram, and main_gpu. Removed rms_norm_eps and n_gqa (deprecated in llama.cpp).
1 parent: 6a20293
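
Since Settings is a pydantic BaseSettings model (see the hunk headers below), each field, including the ones added here, can also be supplied through an environment variable of the same name when the server starts. A minimal sketch, assuming a llama-cpp-python build from the same era as this commit; the model and adapter paths are placeholders, not from the commit:

# Sketch only: Settings fields are populated from environment variables
# by pydantic's BaseSettings machinery (case-insensitive name match).
import os

from llama_cpp.server.app import Settings

os.environ["MAIN_GPU"] = "1"               # field added by this commit
os.environ["LOW_VRAM"] = "true"            # field added by this commit
os.environ["LORA_PATH"] = "./adapter.bin"  # field added by this commit

settings = Settings(model="./models/7B/ggml-model-q4_0.bin")  # placeholder path
print(settings.main_gpu, settings.low_vram, settings.lora_path)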

1 file changed: llama_cpp/server/app.py (44 additions, 37 deletions)

@@ -34,12 +34,21 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_gpu_layers: int = Field(
         default=0,
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
+    main_gpu: int = Field(
+        default=0,
+        ge=0,
+        description="Main GPU to use.",
+    )
     tensor_split: Optional[List[float]] = Field(
         default=None,
         description="Split layers across multiple GPUs in proportion.",
@@ -50,35 +59,45 @@ class Settings(BaseSettings):
     rope_freq_scale: float = Field(
         default=1.0, description="RoPE frequency scaling factor"
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
+    low_vram: bool = Field(
+        default=False,
+        description="Whether to use less VRAM. This will reduce performance.",
     )
-    n_threads: int = Field(
-        default=max(multiprocessing.cpu_count() // 2, 1),
-        ge=1,
-        description="The number of threads to use.",
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
     )
     f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
-        description="Use mlock.",
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_mmap_supported(),
         description="Use mmap.",
     )
+    use_mlock: bool = Field(
+        default=llama_cpp.llama_mlock_supported(),
+        description="Use mlock.",
+    )
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
-    low_vram: bool = Field(
-        default=False,
-        description="Whether to use less VRAM. This will reduce performance.",
+    n_threads: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=1,
+        description="The number of threads to use.",
     )
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    lora_base: Optional[str] = Field(
+        default=None,
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+    )
+    lora_path: Optional[str] = Field(
+        default=None,
+        description="Path to a LoRA file to apply to the model.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -91,9 +110,6 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
-    vocab_only: bool = Field(
-        default=False, description="Whether to only return the vocabulary."
-    )
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
@@ -103,18 +119,6 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
-    n_gqa: Optional[int] = Field(
-        default=None,
-        description="TEMPORARY: Set to 8 for Llama2 70B",
-    )
-    rms_norm_eps: Optional[float] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
-    mul_mat_q: Optional[bool] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
 
 
 class ErrorResponse(TypedDict):
@@ -334,24 +338,27 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
         n_gpu_layers=settings.n_gpu_layers,
+        main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
-        seed=settings.seed,
+        low_vram=settings.low_vram,
+        mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
-        use_mlock=settings.use_mlock,
+        logits_all=settings.logits_all,
+        vocab_only=settings.vocab_only,
         use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
         embedding=settings.embedding,
-        logits_all=settings.logits_all,
         n_threads=settings.n_threads,
-        n_batch=settings.n_batch,
-        n_ctx=settings.n_ctx,
         last_n_tokens_size=settings.last_n_tokens_size,
-        vocab_only=settings.vocab_only,
+        lora_base=settings.lora_base,
+        lora_path=settings.lora_path,
         verbose=settings.verbose,
-        n_gqa=settings.n_gqa,
-        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
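
For reference, a minimal sketch (not part of the commit) of the same parameters passed straight to llama_cpp.Llama, mirroring the create_app call above; keyword names assume a llama-cpp-python build from the same era, and all paths are placeholders:

# Sketch only: constructing Llama with the parameters this commit wires through.
import llama_cpp

llama = llama_cpp.Llama(
    model_path="./models/7B/ggml-model-q4_0.bin",  # placeholder quantized model
    seed=llama_cpp.LLAMA_DEFAULT_SEED,             # new default, replacing 1337
    n_gpu_layers=35,
    main_gpu=0,                                    # added: main GPU to use
    low_vram=False,                                # added: use less VRAM at a performance cost
    lora_base="./models/7B/ggml-model-f16.bin",    # added: f16 base model for the LoRA
    lora_path="./loras/my-adapter.bin",            # added: LoRA file to apply
)
print(llama("Q: Name a planet. A:", max_tokens=16)["choices"][0]["text"])

As the lora_base description notes, pointing it at an f16 copy of the weights lets the LoRA delta be applied against full-precision tensors even when model_path is quantized.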

0 comments