Commit 84615ad

Add split_mode option. Closes abetlen#1085

1 parent 76aafa6
2 files changed (+10 −1)

llama_cpp/llama.py (+6 −1)
```diff
@@ -730,6 +730,7 @@ def __init__(
         *,
         # Model Params
         n_gpu_layers: int = 0,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
@@ -799,7 +800,8 @@ def __init__(
         Args:
             model_path: Path to the model.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-            main_gpu: The GPU that is used for scratch and small tensors.
+            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
```
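For context, a minimal usage sketch of the new option (the model path is hypothetical; the constants are the llama_cpp.LLAMA_SPLIT_* values the docstring above refers to):

```python
import llama_cpp
from llama_cpp import Llama

# Minimal sketch; "./models/model.gguf" is a hypothetical local GGUF path.
llm = Llama(
    model_path="./models/model.gguf",
    n_gpu_layers=-1,  # offload all layers to the GPU(s)
    split_mode=llama_cpp.LLAMA_SPLIT_ROW,  # split tensors row-wise across GPUs
    main_gpu=0,  # under LLAMA_SPLIT_ROW: GPU for small tensors and intermediate results
)
```

The default, llama_cpp.LLAMA_SPLIT_LAYER, distributes whole layers across GPUs and ignores main_gpu, so existing single-GPU callers are unaffected.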
```diff
@@ -850,6 +852,7 @@ def __init__(
         self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._c_tensor_split = None
```
```diff
@@ -2173,6 +2176,7 @@ def __getstate__(self):
             model_path=self.model_path,
             # Model Params
             n_gpu_layers=self.model_params.n_gpu_layers,
+            split_mode=self.model_params.split_mode,
             main_gpu=self.model_params.main_gpu,
             tensor_split=self.tensor_split,
             vocab_only=self.model_params.vocab_only,
@@ -2216,6 +2220,7 @@ def __setstate__(self, state):
             model_path=state["model_path"],
             # Model Params
             n_gpu_layers=state["n_gpu_layers"],
+            split_mode=state["split_mode"],
             main_gpu=state["main_gpu"],
             tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
```
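Because __getstate__ and __setstate__ now both carry split_mode, the setting survives a pickle round trip. A quick sketch, assuming the llm object from the example above and that the model file is still present on load (unpickling re-runs __init__ and reloads the model):

```python
import pickle

# split_mode is captured by __getstate__ and fed back through __setstate__.
payload = pickle.dumps(llm)
restored = pickle.loads(payload)
assert restored.model_params.split_mode == llm.model_params.split_mode
```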

llama_cpp/server/settings.py (+4 −0)
```diff
@@ -28,6 +28,10 @@ class ModelSettings(BaseSettings):
         ge=-1,
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
+    split_mode: int = Field(
+        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        description="The split mode to use.",
+    )
     main_gpu: int = Field(
         default=0,
         ge=0,
```
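On the server side, split_mode becomes an ordinary pydantic settings field, so it can be supplied like any other ModelSettings option. A sketch constructing the settings directly (the model path is hypothetical; the required model field comes from the surrounding class, not shown in this hunk):

```python
import llama_cpp
from llama_cpp.server.settings import ModelSettings

# Hypothetical model path; split_mode defaults to llama_cpp.LLAMA_SPLIT_LAYER.
settings = ModelSettings(
    model="./models/model.gguf",
    n_gpu_layers=-1,
    split_mode=llama_cpp.LLAMA_SPLIT_NONE,  # keep the entire model on main_gpu
    main_gpu=0,
)
```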

0 comments