# LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
# LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+ # LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29


# // note: these values should be synchronized with ggml_rope
@@ -1357,6 +1359,12 @@ def llama_model_n_head(model: llama_model_p, /) -> int:
    ...


+ # LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+ @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
+ def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+     ...
+
+
# // Get the model's RoPE frequency scaling factor
# LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -3375,8 +3383,8 @@ class llama_sampler_i(ctypes.Structure):


# struct llama_sampler {
- #     struct llama_sampler_i * iface;
- #     llama_sampler_context_t ctx;
+ #     const struct llama_sampler_i * iface;
+ #     llama_sampler_context_t ctx;
# };
class llama_sampler(ctypes.Structure):
    _fields_ = [
@@ -3410,6 +3418,18 @@ class llama_sampler(ctypes.Structure):


# // mirror of llama_sampler_i:
+ # LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+ @ctypes_function(
+     "llama_sampler_init",
+     [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t],
+     llama_sampler_p_ctypes,
+ )
+ def llama_sampler_init(
+     iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, /
+ ) -> llama_sampler_p:
+     ...
+
+
# LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
@ctypes_function(
    "llama_sampler_name",
@@ -3627,6 +3647,17 @@ def llama_sampler_init_xtc(
    ...


+ # /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+ # LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+ @ctypes_function(
+     "llama_sampler_init_top_n_sigma",
+     [ctypes.c_float],
+     llama_sampler_p_ctypes,
+ )
+ def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
+     ...
+
+
# /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
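Not part of the diff: a sketch of wiring the new top-n-sigma sampler into a sampler chain via the existing chain bindings; the threshold 2.0 and seed 1234 are illustrative.

```python
# Hypothetical sketch; the chain helpers are existing llama_cpp bindings.
import llama_cpp

chain = llama_cpp.llama_sampler_chain_init(llama_cpp.llama_sampler_chain_default_params())
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_n_sigma(2.0))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))  # final pick

# ... sample with llama_sampler_sample(chain, ctx, -1) during generation, then:
llama_cpp.llama_sampler_free(chain)
```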
@@ -3685,6 +3716,43 @@ def llama_sampler_init_grammar(
    ...


+ # /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+ # /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+ # /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+ # LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+ #     const struct llama_vocab * vocab,
+ #     const char * grammar_str,
+ #     const char * grammar_root,
+ #     const char ** trigger_patterns,
+ #     size_t num_trigger_patterns,
+ #     const llama_token * trigger_tokens,
+ #     size_t num_trigger_tokens);
+ @ctypes_function(
+     "llama_sampler_init_grammar_lazy_patterns",
+     [
+         llama_vocab_p_ctypes,
+         ctypes.c_char_p,
+         ctypes.c_char_p,
+         ctypes.POINTER(ctypes.c_char_p),
+         ctypes.c_size_t,
+         ctypes.POINTER(llama_token),
+         ctypes.c_size_t,
+     ],
+     llama_sampler_p_ctypes,
+ )
+ def llama_sampler_init_grammar_lazy_patterns(
+     vocab: llama_vocab_p,
+     grammar_str: bytes,
+     grammar_root: bytes,
+     trigger_patterns: CtypesArray[bytes],
+     num_trigger_patterns: int,
+     trigger_tokens: CtypesArray[llama_token],
+     num_trigger_tokens: int,
+     /,
+ ) -> llama_sampler_p:
+     ...
+
+
# /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
# LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
#     int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
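Not part of the diff: a sketch of assembling the ctypes arrays the lazy-grammar binding expects. The GBNF string, trigger pattern, and trigger token id are placeholders; the vocab pointer comes from the existing `llama_model_get_vocab` binding and `model` is assumed to be loaded elsewhere.

```python
# Hypothetical sketch; grammar, pattern, and token id are illustrative only.
import ctypes
import llama_cpp

vocab = llama_cpp.llama_model_get_vocab(model)    # model loaded elsewhere

grammar = b'root ::= "yes" | "no"'                 # illustrative GBNF
patterns = (ctypes.c_char_p * 1)(b"<tool_call>")   # trigger patterns
tokens = (llama_cpp.llama_token * 1)(128000)       # illustrative trigger token id

smpl = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
    vocab, grammar, b"root",
    patterns, 1,
    tokens, 1,
)
```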
@@ -3737,7 +3805,7 @@ def llama_sampler_init_dry(
    dry_base: float,
    dry_allowed_length: int,
    dry_penalty_last_n: int,
-     seq_breakers: CtypesArray[bytes],
+     seq_breakers,
    num_breakers: int,
    /,
) -> llama_sampler_p:
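Not part of the diff: with `seq_breakers` now left untyped, a plain ctypes array of `c_char_p` can be passed directly. A sketch of building that argument (the breaker strings are illustrative):

```python
# Hypothetical sketch; only shows how the seq_breakers / num_breakers pair is built.
import ctypes

breakers = [b"\n", b":", b'"', b"*"]
seq_breakers = (ctypes.c_char_p * len(breakers))(*breakers)
num_breakers = len(breakers)
# ... pass these to llama_sampler_init_dry() along with its other (unchanged) arguments.
```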