Commit 62804ee

feat: Update llama.cpp

1 parent: 7e20e34
2 files changed: +71 −3 lines changed
llama_cpp/llama_cpp.py

+70 −2 (70 additions, 2 deletions)
@@ -488,6 +488,15 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_POOLING_TYPE_CLS = 2
 LLAMA_POOLING_TYPE_LAST = 3
 
+# enum llama_attention_type {
+#     LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+#     LLAMA_ATTENTION_TYPE_CAUSAL = 0,
+#     LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
+# };
+LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1
+LLAMA_ATTENTION_TYPE_CAUSAL = 0
+LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+
 # enum llama_split_mode {
 #     LLAMA_SPLIT_MODE_NONE = 0, // single GPU
 #     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -775,6 +784,7 @@ class llama_model_params(ctypes.Structure):
 
 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+#     enum llama_attention_type attention_type; // attention type to use for embeddings
 
 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 #     float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -817,6 +827,7 @@ class llama_context_params(ctypes.Structure):
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        attention_type (int): attention type to use for embeddings
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -847,6 +858,7 @@ class llama_context_params(ctypes.Structure):
     n_threads_batch: int
     rope_scaling_type: int
     pooling_type: int
+    attention_type: int
    rope_freq_base: float
     rope_freq_scale: float
     yarn_ext_factor: float
@@ -876,6 +888,7 @@ class llama_context_params(ctypes.Structure):
        ("n_threads_batch", ctypes.c_uint32),
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
+        ("attention_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -2642,6 +2655,7 @@ def llama_token_eot(model: llama_model_p, /) -> int: ...
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
 # /// @return Returns the number of tokens on success, no more than n_tokens_max
 # /// @return Returns a negative number on failure - the number of tokens that would have been returned
+# /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
 # /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
 # ///                      as plaintext. Does not insert a leading space.
 # LLAMA_API int32_t llama_tokenize(
@@ -2683,7 +2697,7 @@ def llama_tokenize(
         text_len: The length of the text.
         tokens: The tokens pointer must be large enough to hold the resulting tokens.
         n_max_tokens: The maximum number of tokens to return.
-        add_special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
+        add_special: Allow adding special tokens if the model is configured to do so.
         parse_special: Allow parsing special tokens.
 
     Returns:
@@ -2696,13 +2710,14 @@ def llama_tokenize(
 # // Token Id -> Piece.
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
-# // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
 # // @param special If true, special tokens are rendered in the output.
 # LLAMA_API int32_t llama_token_to_piece(
 #     const struct llama_model * model,
 #     llama_token token,
 #     char * buf,
 #     int32_t length,
+#     int32_t lstrip,
 #     bool special);
 @ctypes_function(
     "llama_token_to_piece",
@@ -2711,6 +2726,7 @@ def llama_tokenize(
         llama_token,
         ctypes.c_char_p,
         ctypes.c_int32,
+        ctypes.c_int32,
         ctypes.c_bool,
     ],
     ctypes.c_int32,
@@ -2720,6 +2736,7 @@ def llama_token_to_piece(
     token: Union[llama_token, int],
     buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]],
     length: Union[ctypes.c_int, int],
+    lstrip: Union[ctypes.c_int, int],
     special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
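
For orientation only, here is a minimal sketch of calling the updated `llama_token_to_piece` binding with the new `lstrip` argument; `model` is assumed to be a loaded `llama_model_p` as in the earlier sketch, and the token id and buffer size are arbitrary:

    # Hedged sketch: rendering one token with the new `lstrip` parameter.
    import ctypes
    import llama_cpp.llama_cpp as llama_cpp

    buf = ctypes.create_string_buffer(32)
    token = 1234  # arbitrary token id, for illustration only
    n = llama_cpp.llama_token_to_piece(
        model,     # llama_model_p loaded elsewhere
        token,
        buf,
        len(buf),
        0,         # lstrip: number of leading spaces to skip (new argument)
        False,     # special: do not render special tokens
    )
    piece = buf.raw[:n].decode("utf-8", errors="replace") if n > 0 else ""
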
@@ -2733,10 +2750,61 @@ def llama_token_to_piece(
         token: The token to convert.
         buf: The buffer to write the token to.
         length: The length of the buffer.
+        lstrip: The number of leading spaces to skip.
         special: If true, special tokens are rendered in the output."""
     ...
 
 
+# /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+# /// @param text The char pointer must be large enough to hold the resulting text.
+# /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+# /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+# /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+# /// @param unparse_special If true, special tokens are rendered in the output.
+# LLAMA_API int32_t llama_detokenize(
+#     const struct llama_model * model,
+#     const llama_token * tokens,
+#     int32_t n_tokens,
+#     char * text,
+#     int32_t text_len_max,
+#     bool remove_special,
+#     bool unparse_special);
+@ctypes_function(
+    "llama_detokenize",
+    [
+        llama_model_p_ctypes,
+        ctypes.POINTER(llama_token),
+        ctypes.c_int32,
+        ctypes.c_char_p,
+        ctypes.c_int32,
+        ctypes.c_bool,
+        ctypes.c_bool,
+    ],
+    ctypes.c_int32,
+)
+def llama_detokenize(
+    model: llama_model_p,
+    tokens: CtypesArray[llama_token],
+    n_tokens: Union[ctypes.c_int, int],
+    text: bytes,
+    text_len_max: Union[ctypes.c_int, int],
+    remove_special: Union[ctypes.c_bool, bool],
+    unparse_special: Union[ctypes.c_bool, bool],
+    /,
+) -> int:
+    """Convert the provided tokens into text (inverse of llama_tokenize()).
+
+    Args:
+        model: The model to use for tokenization.
+        tokens: The tokens to convert.
+        n_tokens: The number of tokens.
+        text: The buffer to write the text to.
+        text_len_max: The length of the buffer.
+        remove_special: Allow to remove BOS and EOS tokens if model is configured to do so.
+        unparse_special: If true, special tokens are rendered in the output."""
+    ...
+
+
 # /// Apply chat template. Inspired by hf apply_chat_template() on python.
 # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
 # /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
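
Taken together with the existing `llama_tokenize` binding, the new `llama_detokenize` wrapper allows a low-level round trip from text to tokens and back. A hedged sketch under the same assumptions as above (loaded `model`, arbitrary buffer sizes):

    # Hedged sketch: tokenize and then detokenize with the new binding.
    import ctypes
    import llama_cpp.llama_cpp as llama_cpp

    text = b"Hello, world!"
    max_tokens = 64
    tokens = (llama_cpp.llama_token * max_tokens)()

    n_tokens = llama_cpp.llama_tokenize(
        model, text, len(text), tokens, max_tokens,
        True,   # add_special: add BOS/EOS if the model is configured to do so
        False,  # parse_special
    )

    out = ctypes.create_string_buffer(256)
    n_chars = llama_cpp.llama_detokenize(
        model, tokens, n_tokens, out, len(out),
        False,  # remove_special: keep any BOS/EOS in the output
        False,  # unparse_special: do not render special tokens
    )
    round_trip = out.raw[:n_chars].decode("utf-8", errors="replace") if n_chars > 0 else ""
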

vendor/llama.cpp

+1 −1 (submodule commit updated)
