Commit d808fd4

Update llama.cpp
1 parent 53861c9 commit d808fd4

2 files changed: +63 -36 lines changed

‎llama_cpp/llama_cpp.py

62 additions & 35 deletions
@@ -243,6 +243,7 @@ class llama_token_data_array(Structure):
 # llama_token * token;
 # float * embd;
 # llama_pos * pos;
+# int32_t * n_seq_id;
 # llama_seq_id ** seq_id;
 # int8_t * logits;

@@ -262,6 +263,7 @@ class llama_batch(Structure):
         ("token", POINTER(llama_token)),
         ("embd", c_float_p),
         ("pos", POINTER(llama_pos)),
+        ("n_seq_id", POINTER(c_int32)),
         ("seq_id", POINTER(POINTER(llama_seq_id))),
         ("logits", POINTER(c_int8)),
         ("all_pos_0", llama_pos),
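With the new n_seq_id field, each batch entry records how many sequence ids its seq_id[i] row holds. A minimal ctypes sketch of that layout (illustrative only; the array names below are made up and this does not go through the real llama_batch allocation):

import ctypes

n_tokens = 3
token = (ctypes.c_int32 * n_tokens)(101, 102, 103)   # mirrors llama_token * token
pos = (ctypes.c_int32 * n_tokens)(0, 1, 2)           # mirrors llama_pos * pos
n_seq_id = (ctypes.c_int32 * n_tokens)(1, 1, 1)      # new field: seq ids per token
seq0 = (ctypes.c_int32 * 1)(0)                       # every token belongs to sequence 0
seq0_p = ctypes.cast(seq0, ctypes.POINTER(ctypes.c_int32))
seq_id = (ctypes.POINTER(ctypes.c_int32) * n_tokens)(seq0_p, seq0_p, seq0_p)

for i in range(n_tokens):
    ids = [seq_id[i][j] for j in range(n_seq_id[i])]
    print(f"token {token[i]} at pos {pos[i]} -> sequences {ids}")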
@@ -312,7 +314,7 @@ class llama_model_params(Structure):
 
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 # bool f16_kv; // use fp16 for KV cache, fp32 otherwise
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool embedding; // embedding mode only

@@ -349,6 +351,7 @@ class llama_context_params(Structure):
 # bool allow_requantize; // allow quantizing non-f32/f16 tensors
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+# bool pure; // disable k-quant mixtures and quantize all tensors to the same type
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     _fields_ = [

@@ -777,26 +780,21 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
 _lib.llama_get_kv_cache_token_count.restype = c_int
 
 
-# // Remove all tokens data of cells in [c0, c1)
-# // c0 < 0 : [0, c1]
-# // c1 < 0 : [c0, inf)
-# LLAMA_API void llama_kv_cache_tokens_rm(
-# struct llama_context * ctx,
-# int32_t c0,
-# int32_t c1);
-def llama_kv_cache_tokens_rm(
-    ctx: llama_context_p, c0: Union[c_int32, int], c1: Union[c_int32, int]
-):
-    return _lib.llama_kv_cache_tokens_rm(ctx, c0, c1)
+# // Clear the KV cache
+# LLAMA_API void llama_kv_cache_clear(
+# struct llama_context * ctx);
+def llama_kv_cache_clear(ctx: llama_context_p):
+    return _lib.llama_kv_cache_clear(ctx)
 
 
-_lib.llama_kv_cache_tokens_rm.argtypes = [llama_context_p, c_int32, c_int32]
-_lib.llama_kv_cache_tokens_rm.restype = None
+_lib.llama_kv_cache_clear.argtypes = [llama_context_p]
+_lib.llama_kv_cache_clear.restype = None
 
 
 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
+# // seq_id < 0 : match any sequence
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
 # LLAMA_API void llama_kv_cache_seq_rm(
 # struct llama_context * ctx,
 # llama_seq_id seq_id,
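Callers that used llama_kv_cache_tokens_rm(ctx, -1, -1) to drop every cached cell can switch to llama_kv_cache_clear. A hedged compatibility sketch (it assumes ctx was created through the usual llama_new_context_with_model flow elsewhere; the helper name reset_kv_cache is invented for illustration):

import llama_cpp


def reset_kv_cache(ctx) -> None:
    """Drop all cached token data, whichever binding version is installed."""
    if hasattr(llama_cpp, "llama_kv_cache_clear"):
        llama_cpp.llama_kv_cache_clear(ctx)
    else:
        # Older bindings: negative bounds meant [0, inf), i.e. remove everything.
        llama_cpp.llama_kv_cache_tokens_rm(ctx, -1, -1)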
@@ -1502,7 +1500,7 @@ def llama_sample_classifier_free_guidance(
 _lib.llama_sample_classifier_free_guidance.restype = None
 
 
-# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);

@@ -1519,7 +1517,7 @@ def llama_sample_softmax(
 _lib.llama_sample_softmax.restype = None
 
 
-# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # LLAMA_API void llama_sample_top_k(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,

@@ -1543,7 +1541,7 @@ def llama_sample_top_k(
 _lib.llama_sample_top_k.restype = None
 
 
-# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # LLAMA_API void llama_sample_top_p(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,

@@ -1567,7 +1565,31 @@ def llama_sample_top_p(
 _lib.llama_sample_top_p.restype = None
 
 
-# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+# LLAMA_API void llama_sample_min_p(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# float p,
+# size_t min_keep);
+def llama_sample_min_p(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
+):
+    return _lib.llama_sample_min_p(ctx, candidates, p, min_keep)
+
+
+_lib.llama_sample_min_p.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    c_float,
+    c_size_t,
+]
+_lib.llama_sample_min_p.restype = None
+
+
+# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
 # LLAMA_API void llama_sample_tail_free(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
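For context, a pure-Python sketch of the Min P rule the new binding exposes, based on the description in the linked pull request rather than the C implementation: keep only candidates whose probability is at least p times the top probability, but never fewer than min_keep of them.

import math


def min_p_filter(logits, p=0.05, min_keep=1):
    """Return candidate indices that survive Min P filtering (illustrative sketch)."""
    # Softmax over the logits to get probabilities.
    m = max(logits)
    exps = [math.exp(x - m) for x in logits]
    total = sum(exps)
    probs = [e / total for e in exps]
    # Keep everything within a factor p of the most likely candidate.
    threshold = p * max(probs)
    order = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)
    kept = [i for i in order if probs[i] >= threshold]
    return kept if len(kept) >= min_keep else order[:min_keep]


print(min_p_filter([4.0, 3.5, 1.0, -2.0], p=0.1))  # low-probability tail is dropped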
@@ -1591,7 +1613,7 @@ def llama_sample_tail_free(
 _lib.llama_sample_tail_free.restype = None
 
 
-# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+# /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
 # LLAMA_API void llama_sample_typical(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,

@@ -1656,7 +1678,11 @@ def llama_sample_temperature(
 _lib.llama_sample_temperature.restype = None
 
 
-# LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+# /// @details Apply constraints from grammar
+# LLAMA_API void llama_sample_grammar(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# const struct llama_grammar * grammar);
 def llama_sample_grammar(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]

@@ -1673,12 +1699,12 @@ def llama_sample_grammar(
 _lib.llama_sample_grammar.restype = None
 
 
-# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
 # LLAMA_API llama_token llama_sample_token_mirostat(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
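The tau, eta and mu parameters documented above drive a simple feedback loop. A hedged pure-Python illustration of the mu update described in the paper (stand-in sampled-token probabilities; the s_hat/k estimation step is omitted, so this is not the library's implementation):

import math

tau, eta = 5.0, 0.1
mu = 2.0 * tau  # initialized to twice the target cross-entropy, as noted above

for sampled_prob in (0.02, 0.10, 0.40):  # stand-in probabilities of sampled tokens
    observed_surprise = -math.log2(sampled_prob)
    error = observed_surprise - tau
    mu -= eta * error  # nudge mu toward keeping surprise near tau
    print(f"prob={sampled_prob:.2f} surprise={observed_surprise:.2f} mu={mu:.2f}")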
@@ -1708,11 +1734,11 @@ def llama_sample_token_mirostat(
 _lib.llama_sample_token_mirostat.restype = llama_token
 
 
-# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
 # LLAMA_API llama_token llama_sample_token_mirostat_v2(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,

@@ -1739,7 +1765,8 @@ def llama_sample_token_mirostat_v2(
 _lib.llama_sample_token_mirostat_v2.restype = llama_token
 
 
-# @details Selects the token with the highest probability.
+# /// @details Selects the token with the highest probability.
+# /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
 # LLAMA_API llama_token llama_sample_token_greedy(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);

@@ -1757,7 +1784,7 @@ def llama_sample_token_greedy(
 _lib.llama_sample_token_greedy.restype = llama_token
 
 
-# @details Randomly selects a token from the candidates based on their probabilities.
+# /// @details Randomly selects a token from the candidates based on their probabilities.
 # LLAMA_API llama_token llama_sample_token(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);

‎vendor/llama.cpp


0 commit comments
