 # LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
 # LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
 # LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+# LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
 LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26
 LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
+LLAMA_VOCAB_PRE_TYPE_GPT4O = 29


 # // note: these values should be synchronized with ggml_rope
@@ -1357,6 +1359,12 @@ def llama_model_n_head(model: llama_model_p, /) -> int:
     ...


+# LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+@ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_head_kv(model: llama_model_p, /) -> int:
+    ...
+
+
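A usage sketch for the new binding (editorial, not part of this change): comparing the attention head count with the KV head count is one way to tell whether a loaded model uses grouped-query attention. The `model` handle is assumed to come from the existing model-loading helpers.

import llama_cpp

def describe_attention(model: llama_cpp.llama_model_p) -> str:
    # Existing binding: number of attention (query) heads.
    n_head = llama_cpp.llama_model_n_head(model)
    # New binding from this diff: number of key/value heads.
    n_head_kv = llama_cpp.llama_model_n_head_kv(model)
    if n_head_kv and n_head_kv < n_head:
        return f"GQA: {n_head} query heads share {n_head_kv} KV heads"
    return f"MHA: {n_head} heads"
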
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -3375,8 +3383,8 @@ class llama_sampler_i(ctypes.Structure):


 # struct llama_sampler {
-#     struct llama_sampler_i * iface;
-#     llama_sampler_context_t ctx;
+#     const struct llama_sampler_i * iface;
+#     llama_sampler_context_t        ctx;
 # };
 class llama_sampler(ctypes.Structure):
     _fields_ = [
@@ -3410,6 +3418,18 @@ class llama_sampler(ctypes.Structure):


 # // mirror of llama_sampler_i:
+# LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+@ctypes_function(
+    "llama_sampler_init",
+    [ctypes.POINTER(llama_sampler_i), llama_sampler_context_t],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init(
+    iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, /
+) -> llama_sampler_p:
+    ...
+
+
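A sketch of how the new constructor might be used from Python (editorial, not part of this change): `llama_sampler_init` is the entry point for samplers implemented against `llama_sampler_i`. The callback field names below mirror `struct llama_sampler_i` in llama.h (name/accept/apply/reset/clone/free); that the Python structure exposes them under exactly those names, and the no-op callback itself, are assumptions of this sketch.

import ctypes
import llama_cpp

# Prototype of the required `apply` callback: it receives the sampler handle and
# a pointer to the current llama_token_data_array of candidates.
APPLY_FN = ctypes.CFUNCTYPE(
    None, llama_cpp.llama_sampler_p_ctypes, llama_cpp.llama_token_data_array_p
)

@APPLY_FN
def noop_apply(smpl, cur_p):
    pass  # a real sampler would rewrite cur_p.contents here

iface = llama_cpp.llama_sampler_i()
iface.apply = noop_apply  # assumed field name; remaining callbacks stay NULL (optional per llama.h)
# Keep `iface` and `noop_apply` referenced for as long as the sampler is alive.
custom = llama_cpp.llama_sampler_init(ctypes.pointer(iface), None)
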
 # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
 @ctypes_function(
     "llama_sampler_name",
@@ -3627,6 +3647,17 @@ def llama_sampler_init_xtc(
     ...


+# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+# LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+@ctypes_function(
+    "llama_sampler_init_top_n_sigma",
+    [ctypes.c_float],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3685,6 +3716,43 @@ def llama_sampler_init_grammar(
     ...


+# /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+# /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+# /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+# LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+#         const struct llama_vocab * vocab,
+#         const char * grammar_str,
+#         const char * grammar_root,
+#         const char ** trigger_patterns,
+#         size_t num_trigger_patterns,
+#         const llama_token * trigger_tokens,
+#         size_t num_trigger_tokens);
+@ctypes_function(
+    "llama_sampler_init_grammar_lazy_patterns",
+    [
+        llama_vocab_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.POINTER(ctypes.c_char_p),
+        ctypes.c_size_t,
+        ctypes.POINTER(llama_token),
+        ctypes.c_size_t,
+    ],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_grammar_lazy_patterns(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_patterns: CtypesArray[bytes],
+    num_trigger_patterns: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /,
+) -> llama_sampler_p:
+    ...
+
+
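A marshalling sketch for the new binding (editorial, not part of this change): `trigger_patterns` is a `const char **` on the C side, so the Python strings have to be packed into a `(ctypes.c_char_p * n)` array. The grammar text, the regex pattern, and the trigger token id below are illustrative placeholders, and the `vocab` handle is assumed to be obtained elsewhere (e.g. via llama_model_get_vocab()).

import ctypes
import llama_cpp

grammar_str = b'root ::= "yes" | "no"'          # placeholder GBNF grammar
patterns = [rb"<tool_call>\s*(.*)"]             # placeholder trigger pattern(s)
c_patterns = (ctypes.c_char_p * len(patterns))(*patterns)
trigger_tokens = (llama_cpp.llama_token * 1)(128000)  # placeholder token id

sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
    vocab,                      # llama_vocab_p obtained elsewhere (assumption)
    grammar_str, b"root",
    c_patterns, len(patterns),
    trigger_tokens, len(trigger_tokens),
)
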
 # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
 # LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
 #         int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
@@ -3737,7 +3805,7 @@ def llama_sampler_init_dry(
     dry_base: float,
     dry_allowed_length: int,
     dry_penalty_last_n: int,
-    seq_breakers: CtypesArray[bytes],
+    seq_breakers,
     num_breakers: int,
     /,
 ) -> llama_sampler_p:
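A note on the relaxed `seq_breakers` annotation (editorial, not part of this change): the C side still receives a `const char **`, so one way to build the argument is a ctypes array of `c_char_p` values; the breaker strings below are illustrative.

import ctypes

breakers = [b"\n", b":", b'"', b"*"]
seq_breakers = (ctypes.c_char_p * len(breakers))(*breakers)
num_breakers = len(breakers)
# `seq_breakers` and `num_breakers` can then be passed through to
# llama_sampler_init_dry() together with its remaining arguments.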