@@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
 # bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

+
 # // Abort callback
 # // if it returns true, execution of llama_decode() will be aborted
 # // currently works only with CPU execution
@@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 def llama_n_embd(model: llama_model_p, /) -> int: ...


+# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
+@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_n_layer(model: llama_model_p, /) -> int: ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
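For orientation, here is a minimal sketch of how the new `llama_n_layer` binding could be exercised next to the existing `llama_n_embd` getter. The model path and the no-argument `llama_backend_init()` call are assumptions about the surrounding low-level API, not part of this commit.

# Sketch only, not part of the diff: query layer count and embedding size
# from a loaded model via the low-level bindings.
import llama_cpp

llama_cpp.llama_backend_init()                      # assumed no-argument form
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)  # placeholder path

n_layer = llama_cpp.llama_n_layer(model)            # new binding added above
n_embd = llama_cpp.llama_n_embd(model)
print(f"n_layer={n_layer}, n_embd={n_embd}")

llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()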
@@ -1166,12 +1172,18 @@ def llama_model_quantize(
     ...


+# // Apply a LoRA adapter to a loaded model
+# // path_base_model is the path to a higher quality model to use as a base for
+# // the layers modified by the adapter. Can be NULL to use the current loaded model.
+# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# // will be applied on top of the previous one
+# // Returns 0 on success
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 # const struct llama_model * model,
-# const char * path_lora,
-# float scale,
-# const char * path_base_model,
-# int32_t n_threads);
+# const char * path_lora,
+# float scale,
+# const char * path_base_model,
+# int32_t n_threads);
 @ctypes_function(
     "llama_model_apply_lora_from_file",
     [
@@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file(
     path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
-) -> int: ...
+) -> int:
+    """Apply a LoRA adapter to a loaded model
+    path_base_model is the path to a higher quality model to use as a base for
+    the layers modified by the adapter. Can be NULL to use the current loaded model.
+    The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    will be applied on top of the previous one
+    Returns 0 on success"""
+    ...
+
+
+# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+# // the currently loaded vector.
+# // n_embd should be the size of a single layer's control, and data should point
+# // to an n_embd x n_layers buffer starting from layer 1.
+# // il_start and il_end are the layer range the vector should apply to (both inclusive)
+# // See llama_control_vector_load in common to load a control vector.
+# LLAMA_API int32_t llama_control_vector_apply(
+# struct llama_context * lctx,
+# const float * data,
+# size_t len,
+# int32_t n_embd,
+# int32_t il_start,
+# int32_t il_end);
+@ctypes_function(
+    "llama_control_vector_apply",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.c_size_t,
+        ctypes.c_int32,
+        ctypes.c_int32,
+        ctypes.c_int32,
+    ],
+    ctypes.c_int32,
+)
+def llama_control_vector_apply(
+    lctx: llama_context_p,
+    data: CtypesPointerOrRef[ctypes.c_float],
+    len: int,
+    n_embd: int,
+    il_start: int,
+    il_end: int,
+    /,
+) -> int:
+    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    the currently loaded vector.
+    n_embd should be the size of a single layer's control, and data should point
+    to an n_embd x n_layers buffer starting from layer 1.
+    il_start and il_end are the layer range the vector should apply to (both inclusive)
+    See llama_control_vector_load in common to load a control vector."""
+    ...


 # //
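A hedged usage sketch for `llama_model_apply_lora_from_file` as documented above; the file paths and thread count are placeholders, and error handling is reduced to the documented zero-on-success check.

# Sketch only, not part of the diff: apply a LoRA adapter to a freshly loaded model.
import llama_cpp

params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./base-model.gguf", params)  # placeholder path

rc = llama_cpp.llama_model_apply_lora_from_file(
    model,
    b"./adapter.bin",  # path_lora (placeholder)
    1.0,               # scale
    None,              # path_base_model: NULL -> use the currently loaded model
    4,                 # n_threads
)
if rc != 0:
    raise RuntimeError("applying the LoRA adapter failed")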
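The control-vector binding is the natural consumer of the new `llama_n_layer` getter: the buffer holds `n_embd` floats per layer for layers 1..n_layer, and `il_start`/`il_end` are inclusive. A sketch, using a zero-filled buffer in place of a real control vector (which would normally be produced by `llama_control_vector_load` on the C++ side):

# Sketch only, not part of the diff: apply and then clear a control vector.
import ctypes
import llama_cpp

params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)  # placeholder path
ctx = llama_cpp.llama_new_context_with_model(model, llama_cpp.llama_context_default_params())

n_embd = llama_cpp.llama_n_embd(model)
n_layer = llama_cpp.llama_n_layer(model)

data = (ctypes.c_float * (n_embd * n_layer))()     # zeros stand in for a real control vector
rc = llama_cpp.llama_control_vector_apply(
    ctx, data, len(data), n_embd, 1, n_layer       # il_start=1, il_end=n_layer (both inclusive)
)
assert rc == 0

# Passing NULL data clears whatever vector is currently loaded.
llama_cpp.llama_control_vector_apply(ctx, None, 0, 0, 0, 0)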
@@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file(
 # llama_pos pos;
 # };
 class llama_kv_cache_view_cell(ctypes.Structure):
+    """Information associated with an individual cell in the KV cache view.
+
+    Attributes:
+        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
+            May be negative if the cell is not populated."""
+
     _fields_ = [("pos", llama_pos)]


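For context on the docstring above, a sketch of reading `pos` through the KV cache view helpers; it assumes the `llama_kv_cache_view_init`/`_update`/`_free` bindings defined elsewhere in this module, with placeholder model setup.

# Sketch only, not part of the diff: inspect which KV cache cells are populated.
import ctypes
import llama_cpp

params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)  # placeholder path
ctx = llama_cpp.llama_new_context_with_model(model, llama_cpp.llama_context_default_params())

view = llama_cpp.llama_kv_cache_view_init(ctx, 1)             # track one sequence per cell
llama_cpp.llama_kv_cache_view_update(ctx, ctypes.byref(view))

populated = sum(1 for i in range(view.n_cells) if view.cells[i].pos >= 0)
print(f"{populated} of {view.n_cells} cells populated (negative pos = empty cell)")

llama_cpp.llama_kv_cache_view_free(ctypes.byref(view))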
@@ -1985,7 +2053,7 @@ def llama_tokenize(
     /,
 ) -> int:
     """Convert the provided text into tokens.
-
+
     Args:
         model: The model to use for tokenization.
         text: The text to tokenize.
@@ -1995,10 +2063,11 @@ def llama_tokenize(
         add_bos: Whether to add a beginning-of-sentence token.
         special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
             Does not insert a leading space.
-
+
     Returns:
         Returns the number of tokens on success, no more than n_tokens_max
-        Returns a negative number on failure - the number of tokens that would have been returned"""
+        Returns a negative number on failure - the number of tokens that would have been returned
+    """
     ...


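The negative-return convention documented above suggests a resize-and-retry pattern. A sketch, with the model path as a placeholder and the argument order (model, text, text_len, tokens, n_tokens_max, add_bos, special) taken to match the binding this docstring belongs to:

# Sketch only, not part of the diff: grow the token buffer when llama_tokenize
# reports (as a negative count) that it needed more room.
import llama_cpp

params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)  # placeholder path

text = b"Hello, llama.cpp!"
n_tokens_max = 8
tokens = (llama_cpp.llama_token * n_tokens_max)()
n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)
if n < 0:
    n_tokens_max = -n                               # the size that would have been needed
    tokens = (llama_cpp.llama_token * n_tokens_max)()
    n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)
print(list(tokens[:n]))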