2 files changed, +11 −0 lines changed
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the context
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading llama-2 70b
+
+When loading llama-2 70b, the `n_gqa` parameter (grouped-query attention factor) must be set to 8:
+
+```python
+llm = Llama(model_path="./models/7B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
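For context, here is what the documented call looks like end to end: a minimal usage sketch, assuming a quantized llama-2 70b GGML file on disk (the `./models/70B/...` path and the prompt are illustrative, not part of this diff):

```python
from llama_cpp import Llama

# Hypothetical local path to a llama-2 70b GGML file; adjust to your setup.
llm = Llama(
    model_path="./models/70B/ggml-model.bin",
    n_ctx=2048,  # optional: expand the context window, as documented above
    n_gqa=8,     # required for llama-2 70b (grouped-query attention factor)
)

output = llm("Q: Name the planets in the solar system. A:", max_tokens=48)
print(output["choices"][0]["text"])
```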
@@ -216,6 +216,7 @@ def __init__(
         embedding: bool = False,
         n_threads: Optional[int] = None,
         n_batch: int = 512,
+        n_gqa: Optional[int] = None,  # must be 8 for llama2 70b
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
@@ -260,6 +261,8 @@ def __init__(
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
         self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
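The `if n_gqa is not None` guard only overrides the field when the caller passes a value, so existing callers keep llama.cpp's default. A minimal sketch of that behavior in isolation, with a hypothetical `ParamsStub` standing in for the struct returned by `llama_context_default_params()`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class ParamsStub:
    # Stand-in for the llama.cpp context params; 1 is an assumed default here.
    n_gqa: int = 1

def apply_n_gqa(params: ParamsStub, n_gqa: Optional[int]) -> ParamsStub:
    # Mirrors the constructor's guard: only set the field when a value is given.
    if n_gqa is not None:
        params.n_gqa = n_gqa
    return params

print(apply_n_gqa(ParamsStub(), None).n_gqa)  # 1 -> default preserved
print(apply_n_gqa(ParamsStub(), 8).n_gqa)     # 8 -> llama-2 70b setting
```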