Commit 82b11c8

Merge pull request abetlen#460 from shouyiwang/tensor_split
Add support for llama.cpp's --tensor-split parameter
2 parents: 6705f9b + 579f526
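
For orientation, a minimal usage sketch (not part of this commit) of how the new parameter is passed from Python; the model path, layer count, and split ratios below are placeholders:

# Hypothetical sketch: offload layers to two GPUs in a 60/40 proportion.
# The model path, n_gpu_layers value, and ratios are placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/ggml-model.bin",  # placeholder path
    n_gpu_layers=40,                          # layers offloaded to the GPUs
    tensor_split=[0.6, 0.4],                  # proportion of layers per device
)
output = llm("Q: Name the planets in the solar system. A:", max_tokens=32)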

File tree

2 files changed: 20 additions and 2 deletions

llama_cpp/llama.py

+12 −1: 12 additions & 1 deletion
@@ -19,14 +19,14 @@
 from collections import deque, OrderedDict
 
 import diskcache
+import ctypes
 
 from . import llama_cpp
 from .llama_types import *
 
 import numpy as np
 import numpy.typing as npt
 
-
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 
@@ -207,6 +207,7 @@ def __init__(
         n_ctx: int = 512,
         n_parts: int = -1,
         n_gpu_layers: int = 0,
+        tensor_split: list[float] = None,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -248,12 +249,20 @@ def __init__(
         Returns:
             A Llama instance.
         """
+        if tensor_split is None:
+            tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value
+
+        #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+        FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
+        c_tensor_split = FloatArray(*tensor_split)
+
         self.verbose = verbose
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = n_gpu_layers
+        self.params.tensor_split = c_tensor_split
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -1490,6 +1499,7 @@ def __getstate__(self):
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_gpu_layers=self.params.n_gpu_layers,
+            tensor_split=self.params.tensor_split,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1514,6 +1524,7 @@ def __setstate__(self, state):
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
+            tensor_split=state["tensor_split"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
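
The conversion added in __init__ follows the standard ctypes pattern: multiplying ctypes.c_float by a length produces a fixed-size C float array type, and instantiating it from the Python list copies the values (zero-filling any missing entries) so they can be handed to llama.cpp. A standalone sketch of that pattern, using a hypothetical device count of 2 since llama_cpp.LLAMA_MAX_DEVICES.value depends on how llama.cpp was built:

import ctypes

# Hypothetical device count; in llama-cpp-python this comes from
# llama_cpp.LLAMA_MAX_DEVICES.value and varies with the llama.cpp build.
MAX_DEVICES = 2

tensor_split = [0.6, 0.4]                   # one proportion per device
FloatArray = ctypes.c_float * MAX_DEVICES   # fixed-size C float array type
c_tensor_split = FloatArray(*tensor_split)  # copies values; missing entries are 0.0

print(list(c_tensor_split))                 # C float values, roughly [0.6, 0.4]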

llama_cpp/server/app.py

+8 −1: 8 additions & 1 deletion
@@ -31,7 +31,13 @@ class Settings(BaseSettings):
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
+    tensor_split: List[float] = Field(
+        default=None,
+        description="Split layers across multiple GPUs in proportion.",
+    )
+    seed: int = Field(
+        default=1337, description="Random seed. -1 for random."
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -111,6 +117,7 @@ def create_app(settings: Optional[Settings] = None):
     llama = llama_cpp.Llama(
         model_path=settings.model,
         n_gpu_layers=settings.n_gpu_layers,
+        tensor_split=settings.tensor_split,
         seed=settings.seed,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
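
On the server side, tensor_split becomes a Settings field, so it can be supplied the same way as the other options (Settings is a pydantic BaseSettings, so values typically come from environment variables or keyword arguments). A hypothetical sketch of wiring it up programmatically; the model path and split ratios are placeholders:

# Hypothetical sketch: start the server app with an explicit tensor split.
# The model path and split ratios are placeholders.
import uvicorn
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/7B/ggml-model.bin",  # placeholder model path
    n_gpu_layers=40,
    tensor_split=[0.6, 0.4],             # split layers across two GPUs
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app)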
