@@ -273,11 +273,11 @@ class llama_token_data_array(Structure):
 # } llama_batch;
 class llama_batch(Structure):
     """Input data for llama_decode
-
+
     A llama_batch object can contain input about one or many sequences
-
+
     The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-
+
     Attributes:
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
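
The docstring's contract (every per-token array is indexed up to `n_tokens`) is easy to violate from Python, so here is a minimal sketch of filling a batch by hand. It assumes the module's existing `llama_batch_init`/`llama_batch_free`/`llama_decode` bindings and the per-token `n_seq_id`/`seq_id` layout of the current batch API; treat it as illustration, not as part of this commit.

```python
# Hedged sketch: populate a llama_batch for one sequence of `tokens` (a list
# of Python ints). Assumes llama_batch_init(n_tokens, embd, n_seq_max).
batch = llama_batch_init(len(tokens), 0, 1)  # embd=0 -> use token ids, not embeddings
batch.n_tokens = len(tokens)
for i, tok in enumerate(tokens):
    batch.token[i] = tok          # each array is sized n_tokens
    batch.pos[i] = i              # positions start at 0 for a fresh sequence
    batch.n_seq_id[i] = 1
    batch.seq_id[i][0] = 0        # every token belongs to sequence 0
    batch.logits[i] = i == len(tokens) - 1  # only request logits for the last token
llama_decode(ctx, batch)
llama_batch_free(batch)
```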
@@ -890,18 +890,121 @@ def llama_model_apply_lora_from_file(
 # //
 
 
-# // Returns the number of tokens in the KV cache
-# LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#         "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+# // Information associated with an individual cell in the KV cache view.
+# struct llama_kv_cache_view_cell {
+#     // The position for this cell. Takes KV cache shifts into account.
+#     // May be negative if the cell is not populated.
+#     llama_pos pos;
+# };
+class llama_kv_cache_view_cell(Structure):
+    _fields_ = [("pos", llama_pos)]
+
+
+# // An updateable view of the KV cache.
+# struct llama_kv_cache_view {
+#     // Number of KV cache cells. This will be the same as the context size.
+#     int32_t n_cells;
+
+#     // Maximum number of sequences that can exist in a cell. It's not an error
+#     // if there are more sequences in a cell than this value, however they will
+#     // not be visible in the view cells_sequences.
+#     int32_t n_max_seq;
+
+#     // Number of tokens in the cache. For example, if there are two populated
+#     // cells, the first with 1 sequence id in it and the second with 2 sequence
+#     // ids then you'll have 3 tokens.
+#     int32_t token_count;
+
+#     // Number of populated cache cells.
+#     int32_t used_cells;
+
+#     // Maximum contiguous empty slots in the cache.
+#     int32_t max_contiguous;
+
+#     // Index to the start of the max_contiguous slot range. Can be negative
+#     // when cache is full.
+#     int32_t max_contiguous_idx;
+
+#     // Information for an individual cell.
+#     struct llama_kv_cache_view_cell * cells;
+
+#     // The sequences for each cell. There will be n_max_seq items per cell.
+#     llama_seq_id * cells_sequences;
+# };
+class llama_kv_cache_view(Structure):
+    _fields_ = [
+        ("n_cells", c_int32),
+        ("n_max_seq", c_int32),
+        ("token_count", c_int32),
+        ("used_cells", c_int32),
+        ("max_contiguous", c_int32),
+        ("max_contiguous_idx", c_int32),
+        ("cells", POINTER(llama_kv_cache_view_cell)),
+        ("cells_sequences", POINTER(llama_seq_id)),
+    ]
+
+
+# // Create an empty KV cache view. (use only for debugging purposes)
+# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+def llama_kv_cache_view_init(
+    ctx: llama_context_p, n_max_seq: Union[c_int32, int]
+) -> llama_kv_cache_view:
+    """Create an empty KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_init(ctx, n_max_seq)
+
+
+_lib.llama_kv_cache_view_init.argtypes = [llama_context_p, c_int32]
+_lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
+
+
+# // Free a KV cache view. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+def llama_kv_cache_view_free(view: llama_kv_cache_view):
+    """Free a KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_free(view)
+
+
+_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view]
+_lib.llama_kv_cache_view_free.restype = None
+
+
+# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+def llama_kv_cache_view_update(ctx: llama_context_p, view: llama_kv_cache_view):
+    """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_update(ctx, view)
+
+
+_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view]
+_lib.llama_kv_cache_view_update.restype = None
+
+
+# // Returns the number of tokens in the KV cache (slow, use only for debug)
+# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
-    """Returns the number of tokens in the KV cache"""
+    """Returns the number of tokens in the KV cache (slow, use only for debug)
+    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    """
     return _lib.llama_get_kv_cache_token_count(ctx)
 
 
 _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_token_count.restype = c_int
 
 
+# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+# LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int:
+    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
+    return _lib.llama_get_kv_cache_used_cells(ctx)
+
+
+_lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache_used_cells.restype = c_int
+
+
 # // Clear the KV cache
 # LLAMA_API void llama_kv_cache_clear(
 #         struct llama_context * ctx);
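
For reviewers trying out these new debug bindings, a minimal usage sketch follows. It assumes an initialized `ctx` that has already decoded some tokens; treating negative `pos` as "unpopulated" comes straight from the struct comment above, while the exact padding of unused `cells_sequences` slots is an assumption about upstream behavior.

```python
# Hedged sketch: snapshot and print the KV cache state (debugging only).
view = llama_kv_cache_view_init(ctx, 4)  # track up to 4 seq ids per cell
llama_kv_cache_view_update(ctx, view)    # refresh the view from the live cache
print(f"{view.used_cells}/{view.n_cells} cells used, {view.token_count} tokens")
for i in range(view.n_cells):
    if view.cells[i].pos < 0:
        continue  # negative pos marks an unpopulated cell
    seqs = [
        view.cells_sequences[i * view.n_max_seq + j]
        for j in range(view.n_max_seq)
    ]
    print(f"cell {i}: pos={view.cells[i].pos} seqs={seqs}")
llama_kv_cache_view_free(view)
```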
@@ -1205,8 +1308,9 @@ def llama_batch_get_one(
     seq_id: llama_seq_id,
 ) -> llama_batch:
     """Return batch for single sequence of tokens starting at pos_0
-
-    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it"""
+
+    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    """
     return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id)
 
 
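
For context, this helper is typically driven like so during incremental generation. A hedged sketch, assuming `ctx`, `n_past`, `new_token_id`, and the module's `llama_decode` binding:

```python
# Hedged sketch: decode one new token at position n_past via the legacy helper.
toks = (llama_token * 1)(new_token_id)
batch = llama_batch_get_one(toks, 1, llama_pos(n_past), llama_seq_id(0))
if llama_decode(ctx, batch) != 0:
    raise RuntimeError("llama_decode failed")
```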
@@ -1290,7 +1394,8 @@ def llama_set_n_threads(
 ):
     """Set the number of threads used for decoding
     n_threads is the number of threads used for generation (single token)
-    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)"""
+    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    """
     return _lib.llama_set_n_threads(ctx, n_threads, n_threads_batch)
 
 
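
A brief usage note on the split the docstring describes; the 4/8 values are arbitrary and `ctx` is assumed to exist:

```python
# Hedged sketch: fewer threads for per-token generation, more for prompt/batch work.
llama_set_n_threads(ctx, n_threads=4, n_threads_batch=8)
```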
@@ -1540,7 +1645,8 @@ def llama_token_to_piece(
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
-    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens."""
+    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    """
     return _lib.llama_token_to_piece(model, token, buf, length)
 
 
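
Because the buffer contract here is easy to get wrong from Python, a hedged helper sketch follows. The 32-byte initial buffer and the retry on a negative return (upstream llama.cpp reports the required size that way) are assumptions, not guarantees of this binding:

```python
import ctypes

def token_to_str(model, token: int) -> str:
    # Hedged sketch: decode one token id to text via llama_token_to_piece.
    size = 32  # assumption: covers typical BPE pieces
    buf = (ctypes.c_char * size)()
    n = llama_token_to_piece(model, token, buf, size)
    if n < 0:  # assumed convention: -n is the required buffer size
        buf = (ctypes.c_char * -n)()
        n = llama_token_to_piece(model, token, buf, -n)
    return buf.raw[:n].decode("utf-8", errors="replace")
```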
@@ -1626,7 +1732,8 @@ def llama_sample_repetition_penalties(
     penalty_present: Union[c_float, float],
 ):
     """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."""
+    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    """
     return _lib.llama_sample_repetition_penalties(
         ctx,
         candidates,