@@ -273,11 +273,11 @@ class llama_token_data_array(Structure):
# } llama_batch;
class llama_batch(Structure):
    """Input data for llama_decode
-
+
    A llama_batch object can contain input about one or many sequences
-
+
    The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-
+
    Attributes:
        token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
        embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
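For orientation, a minimal sketch of reading tokens back out of a filled batch. It assumes the struct also carries an `n_tokens` count field as in upstream `llama.h` (that field sits outside this hunk), and `dump_batch_tokens` is a hypothetical helper name:

def dump_batch_tokens(batch: llama_batch) -> list:
    # Each provided array (token, pos, ...) holds n_tokens valid entries;
    # token is only meaningful when embd is NULL.
    return [batch.token[i] for i in range(batch.n_tokens)]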
@@ -890,18 +890,121 @@ def llama_model_apply_lora_from_file(
# //


-# // Returns the number of tokens in the KV cache
-# LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#         "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+# // Information associated with an individual cell in the KV cache view.
+# struct llama_kv_cache_view_cell {
+#     // The position for this cell. Takes KV cache shifts into account.
+#     // May be negative if the cell is not populated.
+#     llama_pos pos;
+# };
+class llama_kv_cache_view_cell(Structure):
+    _fields_ = [("pos", llama_pos)]
+
+
+# // An updateable view of the KV cache.
+# struct llama_kv_cache_view {
+#     // Number of KV cache cells. This will be the same as the context size.
+#     int32_t n_cells;
+
+#     // Maximum number of sequences that can exist in a cell. It's not an error
+#     // if there are more sequences in a cell than this value, however they will
+#     // not be visible in the view cells_sequences.
+#     int32_t n_max_seq;
+
+#     // Number of tokens in the cache. For example, if there are two populated
+#     // cells, the first with 1 sequence id in it and the second with 2 sequence
+#     // ids then you'll have 3 tokens.
+#     int32_t token_count;
+
+#     // Number of populated cache cells.
+#     int32_t used_cells;
+
+#     // Maximum contiguous empty slots in the cache.
+#     int32_t max_contiguous;
+
+#     // Index to the start of the max_contiguous slot range. Can be negative
+#     // when cache is full.
+#     int32_t max_contiguous_idx;
+
+#     // Information for an individual cell.
+#     struct llama_kv_cache_view_cell * cells;
+
+#     // The sequences for each cell. There will be n_max_seq items per cell.
+#     llama_seq_id * cells_sequences;
+# };
+class llama_kv_cache_view(Structure):
+    _fields_ = [
+        ("n_cells", c_int32),
+        ("n_max_seq", c_int32),
+        ("token_count", c_int32),
+        ("used_cells", c_int32),
+        ("max_contiguous", c_int32),
+        ("max_contiguous_idx", c_int32),
+        ("cells", POINTER(llama_kv_cache_view_cell)),
+        ("cells_sequences", POINTER(llama_seq_id)),
+    ]
+
+
+# // Create an empty KV cache view. (use only for debugging purposes)
+# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+def llama_kv_cache_view_init(
+    ctx: llama_context_p, n_max_seq: Union[c_int32, int]
+) -> llama_kv_cache_view:
+    """Create an empty KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_init(ctx, n_max_seq)
+
+
+_lib.llama_kv_cache_view_init.argtypes = [llama_context_p, c_int32]
+_lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
+
+
+# // Free a KV cache view. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+def llama_kv_cache_view_free(view: "ctypes._Pointer[llama_kv_cache_view]"):  # type: ignore
+    """Free a KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_free(view)
+
+
+_lib.llama_kv_cache_view_free.argtypes = [POINTER(llama_kv_cache_view)]
+_lib.llama_kv_cache_view_free.restype = None
+
+
+# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes._Pointer[llama_kv_cache_view]"):  # type: ignore
+    """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_update(ctx, view)
+
+
+_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, POINTER(llama_kv_cache_view)]
+_lib.llama_kv_cache_view_update.restype = None
+
+
+# // Returns the number of tokens in the KV cache (slow, use only for debug)
+# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
-    """Returns the number of tokens in the KV cache"""
+    """Returns the number of tokens in the KV cache (slow, use only for debug)
+    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    """
    return _lib.llama_get_kv_cache_token_count(ctx)


_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
_lib.llama_get_kv_cache_token_count.restype = c_int


+# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+# LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int:
+    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
+    return _lib.llama_get_kv_cache_used_cells(ctx)
+
+
+_lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache_used_cells.restype = c_int
+
+
# // Clear the KV cache
# LLAMA_API void llama_kv_cache_clear(
#         struct llama_context * ctx);
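Taken together, these bindings support a small debugging routine. The sketch below is illustrative only: `ctx` is assumed to be an already-initialized `llama_context_p`, `dump_kv_cache_stats` is a hypothetical helper, and the view is passed with `ctypes.byref` because the C prototypes take it by pointer:

import ctypes

def dump_kv_cache_stats(ctx, n_max_seq: int = 4):
    # Allocate a view sized to the context, refresh it, read it, free it.
    view = llama_kv_cache_view_init(ctx, n_max_seq)
    llama_kv_cache_view_update(ctx, ctypes.byref(view))
    # token_count counts a cell once per sequence id, so it can exceed used_cells.
    print("cells:", view.n_cells, "used:", view.used_cells, "tokens:", view.token_count)
    for i in range(view.n_cells):
        if view.cells[i].pos < 0:
            continue  # a negative pos marks an unpopulated cell
        # cells_sequences is a flat array with n_max_seq slots per cell.
        seqs = [view.cells_sequences[i * n_max_seq + j] for j in range(n_max_seq)]
        print(f"cell {i}: pos={view.cells[i].pos} seqs={seqs}")
    llama_kv_cache_view_free(ctypes.byref(view))

When only the aggregate numbers are needed, the two standalone counters, llama_get_kv_cache_token_count and llama_get_kv_cache_used_cells, give the same information without allocating a view.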
@@ -1205,8 +1308,9 @@ def llama_batch_get_one(
    seq_id: llama_seq_id,
) -> llama_batch:
    """Return batch for single sequence of tokens starting at pos_0
-
-    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it"""
+
+    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    """
    return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id)
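As a hedged illustration of this transitional helper (the helper name below is hypothetical): the batch borrows the caller's token array rather than copying it, so the array must outlive the batch:

def batch_from_tokens(token_ids, pos_0=0, seq_id=0):
    # Keep a reference to `tokens`: the returned llama_batch points into it.
    tokens = (llama_token * len(token_ids))(*token_ids)
    return tokens, llama_batch_get_one(tokens, len(token_ids), pos_0, seq_id)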
@@ -1290,7 +1394,8 @@ def llama_set_n_threads(
):
    """Set the number of threads used for decoding
    n_threads is the number of threads used for generation (single token)
-    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)"""
+    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    """
    return _lib.llama_set_n_threads(ctx, n_threads, n_threads_batch)
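A small configuration sketch, assuming a live `ctx`; since the two knobs are independent, throughput-bound batch processing can be given more threads than latency-sensitive single-token generation:

import multiprocessing

n_cores = multiprocessing.cpu_count()
# Fewer threads for single-token decoding, more for prompt/batch processing.
llama_set_n_threads(ctx, n_threads=max(1, n_cores // 2), n_threads_batch=n_cores)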
@@ -1540,7 +1645,8 @@ def llama_token_to_piece(
    """Token Id -> Piece.
    Uses the vocabulary in the provided context.
    Does not write null terminator to the buffer.
-    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens."""
+    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    """
    return _lib.llama_token_to_piece(model, token, buf, length)
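A hedged decoding loop built on this binding; it assumes the return value is the number of bytes written (as in llama.cpp of this era) and that 32 bytes is enough for any single piece:

import ctypes

def detokenize(model, token_ids) -> str:
    size = 32
    buf = (ctypes.c_char * size)()
    data = b""
    for tok in token_ids:
        n = llama_token_to_piece(model, tok, buf, size)
        data += buf.raw[:n]  # no null terminator is written; slice by count
    text = data.decode("utf-8", errors="replace")
    # Per the docstring, the caller strips the leading whitespace of the
    # first non-BOS token.
    return text[1:] if text.startswith(" ") else text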
@@ -1626,7 +1732,8 @@ def llama_sample_repetition_penalties(
    penalty_present: Union[c_float, float],
):
    """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."""
+    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    """
    return _lib.llama_sample_repetition_penalties(
        ctx,
        candidates,
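The hunk above is cut off mid-call; purely for orientation, here is a sketch of one plausible full invocation. It assumes the wrapper keeps the C prototype's parameter order (candidates, recent tokens, window size, then the three penalty strengths); the values are illustrative defaults, not recommendations:

import ctypes

# `candidates` is a llama_token_data_array of current logits and
# `last_tokens` a Python list of recent token ids (both assumed to exist).
recent = (llama_token * len(last_tokens))(*last_tokens)
llama_sample_repetition_penalties(
    ctx,
    ctypes.byref(candidates),
    recent,
    min(64, len(last_tokens)),  # penalty_last_n: how far back to penalize
    1.1,                        # penalty_repeat (CTRL-style, >1 penalizes repeats)
    0.0,                        # penalty_freq (OpenAI frequency penalty)
    0.0,                        # penalty_present (OpenAI presence penalty)
)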