Commit 36048d4

Update llama.cpp
1 parent 4474157 commit 36048d4

File tree

2 files changed: +120 -13 lines changed

‎llama_cpp/llama_cpp.py

+119 -12 lines changed: 119 additions & 12 deletions
@@ -273,11 +273,11 @@ class llama_token_data_array(Structure):
 # } llama_batch;
 class llama_batch(Structure):
     """Input data for llama_decode
-
+
     A llama_batch object can contain input about one or many sequences
-
+
     The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-
+
     Attributes:
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
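The docstring above describes llama_batch as a plain ctypes structure whose per-token arrays (token, pos, etc.) all have length n_tokens. A rough sketch, not part of this commit, of inspecting those fields on a batch built with the llama_batch_get_one helper that appears later in this diff (the prompt ids are placeholders):

```python
from llama_cpp.llama_cpp import llama_batch_get_one, llama_token

def show_batch_fields(prompt_ids):
    # Pack the (hypothetical) prompt ids into a ctypes array of llama_token.
    tokens = (llama_token * len(prompt_ids))(*prompt_ids)
    # Single-sequence batch starting at position 0 with sequence id 0.
    batch = llama_batch_get_one(tokens, len(prompt_ids), 0, 0)
    # token holds one id per position; embd is NULL here because the batch
    # was built from token ids rather than embeddings.
    for i in range(batch.n_tokens):
        print(i, batch.token[i])
```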
@@ -890,18 +890,121 @@ def llama_model_apply_lora_from_file(
 # //


-# // Returns the number of tokens in the KV cache
-# LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#     "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+# // Information associated with an individual cell in the KV cache view.
+# struct llama_kv_cache_view_cell {
+#     // The position for this cell. Takes KV cache shifts into account.
+#     // May be negative if the cell is not populated.
+#     llama_pos pos;
+# };
+class llama_kv_cache_view_cell(Structure):
+    _fields_ = [("pos", llama_pos)]
+
+
+# // An updateable view of the KV cache.
+# struct llama_kv_cache_view {
+#     // Number of KV cache cells. This will be the same as the context size.
+#     int32_t n_cells;
+
+#     // Maximum number of sequences that can exist in a cell. It's not an error
+#     // if there are more sequences in a cell than this value, however they will
+#     // not be visible in the view cells_sequences.
+#     int32_t n_max_seq;
+
+#     // Number of tokens in the cache. For example, if there are two populated
+#     // cells, the first with 1 sequence id in it and the second with 2 sequence
+#     // ids then you'll have 3 tokens.
+#     int32_t token_count;
+
+#     // Number of populated cache cells.
+#     int32_t used_cells;
+
+#     // Maximum contiguous empty slots in the cache.
+#     int32_t max_contiguous;
+
+#     // Index to the start of the max_contiguous slot range. Can be negative
+#     // when cache is full.
+#     int32_t max_contiguous_idx;
+
+#     // Information for an individual cell.
+#     struct llama_kv_cache_view_cell * cells;
+
+#     // The sequences for each cell. There will be n_max_seq items per cell.
+#     llama_seq_id * cells_sequences;
+# };
+class llama_kv_cache_view(Structure):
+    _fields_ = [
+        ("n_cells", c_int32),
+        ("n_max_seq", c_int32),
+        ("token_count", c_int32),
+        ("used_cells", c_int32),
+        ("max_contiguous", c_int32),
+        ("max_contiguous_idx", c_int32),
+        ("cells", POINTER(llama_kv_cache_view_cell)),
+        ("cells_sequences", POINTER(llama_seq_id)),
+    ]
+
+
+# // Create an empty KV cache view. (use only for debugging purposes)
+# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+def llama_kv_cache_view_init(
+    ctx: llama_context_p, n_max_seq: Union[c_int32, int]
+) -> llama_kv_cache_view:
+    """Create an empty KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_init(ctx, n_max_seq)
+
+
+_lib.llama_kv_cache_view_init.argtypes = [llama_context_p, c_int32]
+_lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
+
+
+# // Free a KV cache view. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+def llama_kv_cache_view_free(view: llama_kv_cache_view):
+    """Free a KV cache view. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_free(view)
+
+
+_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view]
+_lib.llama_kv_cache_view_free.restype = None
+
+
+# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+def llama_kv_cache_view_update(ctx: llama_context_p, view: llama_kv_cache_view):
+    """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
+    return _lib.llama_kv_cache_view_update(ctx, view)
+
+
+_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view]
+_lib.llama_kv_cache_view_update.restype = None
+
+
+# // Returns the number of tokens in the KV cache (slow, use only for debug)
+# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
-    """Returns the number of tokens in the KV cache"""
+    """Returns the number of tokens in the KV cache (slow, use only for debug)
+    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    """
     return _lib.llama_get_kv_cache_token_count(ctx)


 _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_token_count.restype = c_int


+# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+# LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int:
+    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
+    return _lib.llama_get_kv_cache_used_cells(ctx)
+
+
+_lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache_used_cells.restype = c_int
+
+
 # // Clear the KV cache
 # LLAMA_API void llama_kv_cache_clear(
 #         struct llama_context * ctx);
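The bindings added above mirror llama.cpp's KV cache debugging API. A rough usage sketch, not part of this commit, assuming ctx is an existing llama_context_p on which some tokens have already been decoded (the view is intended only for debugging, per the comments above):

```python
from llama_cpp.llama_cpp import (
    llama_get_kv_cache_token_count,
    llama_get_kv_cache_used_cells,
    llama_kv_cache_view_free,
    llama_kv_cache_view_init,
    llama_kv_cache_view_update,
)

def dump_kv_cache(ctx, n_max_seq=4):
    # Allocate a view sized for the context, then snapshot the current cache state.
    view = llama_kv_cache_view_init(ctx, n_max_seq)
    llama_kv_cache_view_update(ctx, view)

    print("cells:", view.n_cells, "used:", view.used_cells, "tokens:", view.token_count)
    print("helpers:", llama_get_kv_cache_token_count(ctx), llama_get_kv_cache_used_cells(ctx))

    # Each cell records its position (negative when unpopulated); the sequence
    # ids for cell i live at cells_sequences[i * n_max_seq : (i + 1) * n_max_seq].
    for i in range(view.n_cells):
        cell = view.cells[i]
        if cell.pos >= 0:
            seqs = [view.cells_sequences[i * view.n_max_seq + j] for j in range(view.n_max_seq)]
            print(f"cell {i}: pos={cell.pos} seq_ids={seqs}")

    llama_kv_cache_view_free(view)
```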
@@ -1205,8 +1308,9 @@ def llama_batch_get_one(
     seq_id: llama_seq_id,
 ) -> llama_batch:
     """Return batch for single sequence of tokens starting at pos_0
-
-    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it"""
+
+    NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    """
     return _lib.llama_batch_get_one(tokens, n_tokens, pos_0, seq_id)


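As the NOTE says, llama_batch_get_one is only a bridge to the new batch API. A hedged sketch, not from this commit, of the transitional pattern of decoding a prompt with it; ctx is assumed to be an existing llama_context_p, and llama_decode from the same module is assumed to run the batch:

```python
from llama_cpp.llama_cpp import llama_batch_get_one, llama_decode, llama_token

def decode_prompt(ctx, prompt_ids, pos_0=0, seq_id=0):
    # Pack the prompt into a single-sequence batch starting at pos_0.
    tokens = (llama_token * len(prompt_ids))(*prompt_ids)
    batch = llama_batch_get_one(tokens, len(prompt_ids), pos_0, seq_id)
    # llama_decode returns 0 on success; treat anything else as a failure here.
    if llama_decode(ctx, batch) != 0:
        raise RuntimeError("llama_decode failed")
```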
@@ -1290,7 +1394,8 @@ def llama_set_n_threads(
 ):
     """Set the number of threads used for decoding
     n_threads is the number of threads used for generation (single token)
-    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)"""
+    n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    """
     return _lib.llama_set_n_threads(ctx, n_threads, n_threads_batch)


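A small illustration of the split described in the docstring, not part of this commit; ctx is assumed to be an existing llama_context_p and the thread counts are arbitrary:

```python
from llama_cpp.llama_cpp import llama_set_n_threads

def configure_threads(ctx, gen_threads=4, batch_threads=8):
    # gen_threads drives single-token generation; batch_threads drives
    # prompt and batch processing, as the docstring above explains.
    llama_set_n_threads(ctx, gen_threads, batch_threads)
```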
@@ -1540,7 +1645,8 @@ def llama_token_to_piece(
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
-    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens."""
+    User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    """
     return _lib.llama_token_to_piece(model, token, buf, length)


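The docstring leaves buffer management and the leading-whitespace convention to the caller. A rough sketch, not part of this commit, of detokenizing a list of ids with this binding; model is assumed to be a loaded llama_model_p, the ids are assumed not to start with BOS, and the 32-byte buffer is an illustrative choice:

```python
import ctypes

from llama_cpp.llama_cpp import llama_token_to_piece

def detokenize(model, token_ids):
    pieces = []
    for idx, tok in enumerate(token_ids):
        # The call returns the number of bytes written and adds no null
        # terminator; a negative return would mean the buffer was too small
        # (not handled in this sketch).
        buf = ctypes.create_string_buffer(32)
        n = llama_token_to_piece(model, tok, buf, len(buf))
        piece = buf.raw[:n].decode("utf-8", errors="replace")
        # Per the docstring, the caller strips the leading whitespace of the
        # first token when decoding multiple tokens.
        pieces.append(piece.lstrip(" ") if idx == 0 else piece)
    return "".join(pieces)
```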
@@ -1626,7 +1732,8 @@ def llama_sample_repetition_penalties(
     penalty_present: Union[c_float, float],
 ):
     """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."""
+    Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    """
     return _lib.llama_sample_repetition_penalties(
         ctx,
         candidates,
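A rough sketch, not part of this commit, of calling this sampler from the low-level API. It assumes ctx is an existing llama_context_p, candidates is a llama_token_data_array already filled from the current logits, recent_ids holds recently sampled token ids, the penalty values are illustrative, and the argument order matches the underlying C function (the full parameter list is not shown in this hunk):

```python
import ctypes

from llama_cpp.llama_cpp import llama_sample_repetition_penalties, llama_token

def apply_repetition_penalties(ctx, candidates, recent_ids,
                               penalty_repeat=1.1, penalty_freq=0.0, penalty_present=0.0):
    # Penalize at most the last 64 sampled tokens, in place on the candidates
    # array, before running the other llama_sample_* functions.
    last_n = min(64, len(recent_ids))
    last_tokens = (llama_token * last_n)(*recent_ids[-last_n:])
    llama_sample_repetition_penalties(
        ctx,
        ctypes.byref(candidates),
        last_tokens,
        last_n,
        penalty_repeat,
        penalty_freq,
        penalty_present,
    )
```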
‎vendor/llama.cpp

+1 -1 lines changed (submodule commit updated)

0 commit comments