Commit 7403e00 (parent: 7c4aead)

feat: Update llama.cpp

3 files changed: 139 additions (+), 33 deletions (−)

CMakeLists.txt — 17 additions, 1 deletion
@@ -55,6 +55,9 @@ if (LLAMA_BUILD)
         set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
         set(CMAKE_SKIP_RPATH FALSE)
 
+        # Enable building of the common library
+        set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)
+
         # Building llama
         if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
             # Need to disable these llama.cpp flags on Apple x86_64,
@@ -106,7 +109,7 @@ if (LLAMA_BUILD)
         # Building llava
         add_subdirectory(vendor/llama.cpp/examples/llava)
         set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
-        # Set CUDA_ARCHITECTURES to OFF on windows
+        # Set CUDA_ARCHITECTURES to OFF on Windows
         if (WIN32)
             set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()
@@ -121,5 +124,18 @@ if (LLAMA_BUILD)
                 DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
             )
         endif()
+
+        # Fix for llava build: Add include directory for llama.h
+        # Move these commands after the add_subdirectory call
+        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+
+        if (BUILD_SHARED_LIBS)
+            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+        endif()
+
+        target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
     endif()
 endif()
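Note: the include-directory fix above only affects compile time; a quick post-install smoke test is to load the resulting llava shared library directly. A minimal sketch, assuming a Linux build where the library lands as libllava.so under llama_cpp/lib (the filename and path layout are assumptions and differ per platform and wheel):

    import ctypes
    import pathlib

    import llama_cpp

    # The installed package ships its native libraries next to the
    # Python sources; this path layout is an assumption, not an API.
    lib_dir = pathlib.Path(llama_cpp.__file__).parent / "lib"

    # If the llava targets compiled against the right llama.h/ggml
    # headers, the library loads without unresolved symbols.
    llava = ctypes.CDLL(str(lib_dir / "libllava.so"))
    print("llava library loaded OK")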

llama_cpp/llama_cpp.py — 121 additions, 31 deletions
@@ -464,6 +464,8 @@ class llama_token_data(ctypes.Structure):
 
 
 # typedef struct llama_token_data_array {
+#     // TODO: consider SoA
+#     // NOTE: this pointer can be modified by the samplers
 #     llama_token_data * data;
 #     size_t size;
 #     int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -507,8 +509,11 @@ class llama_token_data_array(ctypes.Structure):
 # //  - token  : the token ids of the input (used when embd is NULL)
 # //  - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
 # //  - pos    : the positions of the respective token in the sequence
+# //             (if set to NULL, the token position will be tracked automatically by llama_decode)
 # //  - seq_id : the sequence to which the respective token belongs
+# //             (if set to NULL, the sequence ID will be assumed to be 0)
 # //  - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+# //             (if set to NULL, only the logits for last token will be returned)
 # //
 # typedef struct llama_batch {
 #     int32_t n_tokens;
@@ -519,16 +524,6 @@ class llama_token_data_array(ctypes.Structure):
 #     int32_t        * n_seq_id;
 #     llama_seq_id  ** seq_id;
 #     int8_t         * logits; // TODO: rename this to "output"
-
-
-#     // NOTE: helpers for smooth API transition - can be deprecated in the future
-#     // for future-proof code, use the above fields instead and ignore everything below
-#     //
-#     // pos[i] = all_pos_0 + i*all_pos_1
-#     //
-#     llama_pos    all_pos_0;  // used if pos == NULL
-#     llama_pos    all_pos_1;  // used if pos == NULL
-#     llama_seq_id all_seq_id; // used if seq_id == NULL
 # } llama_batch;
 class llama_batch(ctypes.Structure):
     """Input data for llama_decode
@@ -563,9 +558,6 @@ class llama_batch(ctypes.Structure):
         ("n_seq_id", ctypes.POINTER(ctypes.c_int32)),
         ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))),
         ("logits", ctypes.POINTER(ctypes.c_int8)),
-        ("all_pos_0", llama_pos),
-        ("all_pos_1", llama_pos),
-        ("all_seq_id", llama_seq_id),
     ]
 
 
@@ -1170,6 +1162,12 @@ def llama_supports_gpu_offload() -> bool:
     ...
 
 
+# LLAMA_API bool llama_supports_rpc (void);
+@ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
+def llama_supports_rpc() -> bool:
+    ...
+
+
 # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ctx(ctx: llama_context_p, /) -> int:
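Note: a minimal usage sketch for the new capability probe, mirroring the other llama_supports_* checks already exposed by these bindings:

    import llama_cpp

    # Probe the loaded libllama for RPC support before trying to
    # configure a distributed (RPC backend) setup.
    if llama_cpp.llama_supports_rpc():
        print("libllama was built with the RPC backend")
    else:
        print("no RPC support in this build")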
@@ -2255,30 +2253,26 @@ def llama_state_seq_load_file(
 # //
 
 
-# // Return batch for single sequence of tokens starting at pos_0
+# // Return batch for single sequence of tokens
+# // The sequence ID will be fixed to 0
+# // The position of the tokens will be tracked automatically by llama_decode
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
 # //
 # LLAMA_API struct llama_batch llama_batch_get_one(
 #       llama_token * tokens,
-#       int32_t       n_tokens,
-#       llama_pos     pos_0,
-#       llama_seq_id  seq_id);
+#       int32_t       n_tokens);
 @ctypes_function(
     "llama_batch_get_one",
     [
         llama_token_p,
-        ctypes.c_int,
-        llama_pos,
-        llama_seq_id,
+        ctypes.c_int32,
     ],
     llama_batch,
 )
 def llama_batch_get_one(
     tokens: CtypesArray[llama_token],
     n_tokens: Union[ctypes.c_int, int],
-    pos_0: Union[llama_pos, int],
-    seq_id: llama_seq_id,
     /,
 ) -> llama_batch:
     """Return batch for single sequence of tokens starting at pos_0
@@ -2616,6 +2610,13 @@ def llama_token_eos(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
+@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
+def llama_token_eot(model: llama_model_p, /) -> int:
+    """end-of-turn"""
+    ...
+
+
 # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
 @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token)
 def llama_token_cls(model: llama_model_p, /) -> int:
@@ -2650,30 +2651,54 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool:
 
 
 # // Codellama infill tokens
-# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
     """codellama infill tokens"""
     ...
 
 
-# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
 @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
 def llama_token_middle(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
 @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
 def llama_token_suffix(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
-@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
-def llama_token_eot(model: llama_model_p, /) -> int:
+# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_pre(model: llama_model_p, /) -> int:
+    ...
+
+
+# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_suf(model: llama_model_p, /) -> int:
     ...
 
+
+# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_mid(model: llama_model_p, /) -> int:
+    ...
+
+
+# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_pad(model: llama_model_p, /) -> int:
+    ...
+
+
+# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_rep(model: llama_model_p, /) -> int:
+    ...
+
+
+# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
+@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_sep(model: llama_model_p, /) -> int:
+    ...
 
 # //
 # // Tokenization
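Note: together the new accessors let client code assemble a fill-in-the-middle prompt without hard-coding token IDs. A sketch, assuming `model` is an already-loaded llama_model pointer and that models lacking FIM special tokens return a negative ID (an assumption worth validating per model):

    import llama_cpp

    def build_fim_prompt(model, prefix_tokens, suffix_tokens):
        # Query the model's FIM special tokens instead of hard-coding them.
        fim_pre = llama_cpp.llama_token_fim_pre(model)
        fim_suf = llama_cpp.llama_token_fim_suf(model)
        fim_mid = llama_cpp.llama_token_fim_mid(model)
        # Assumption: models without FIM vocab report negative token ids.
        if min(fim_pre, fim_suf, fim_mid) < 0:
            raise ValueError("model does not define FIM special tokens")
        # Conventional FIM layout: <pre> prefix <suf> suffix <mid>,
        # after which the model generates the missing middle span.
        return [fim_pre, *prefix_tokens, fim_suf, *suffix_tokens, fim_mid]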
@@ -2786,6 +2811,23 @@ def llama_token_to_piece(
     ...
 
 
+# # // check if token0 is contained as a prefix in token1
+# # LLAMA_API bool llama_token_is_prefix(
+# #           const struct llama_model * model,
+# #                      llama_token token0,
+# #                      llama_token token1);
+# @ctypes_function(
+#     "llama_token_is_prefix",
+#     [llama_model_p_ctypes, llama_token, llama_token],
+#     ctypes.c_bool,
+# )
+# def llama_token_is_prefix(
+#     model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], /
+# ) -> bool:
+#     """Check if token0 is contained as a prefix in token1"""
+#     ...
+
+
 # /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
 # /// @param text The char pointer must be large enough to hold the resulting text.
 # /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -3099,20 +3141,22 @@ def llama_sampler_chain_remove(
 
 # // available samplers:
 #
-# LLAMA_API struct llama_sampler * llama_sampler_init_greedy   (void);
+# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
 @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
 def llama_sampler_init_greedy() -> llama_sampler_p:
     ...
 
 
-# LLAMA_API struct llama_sampler * llama_sampler_init_dist     (uint32_t seed);
+# LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
 def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
     ...
 
 
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-# LLAMA_API struct llama_sampler * llama_sampler_init_softmax  (void);
+# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+#     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
 def llama_sampler_init_softmax() -> llama_sampler_p:
     ...
@@ -3188,6 +3232,19 @@ def llama_sampler_init_temp_ext(
     ...
 
 
+# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+# LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+@ctypes_function(
+    "llama_sampler_init_xtc",
+    [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_xtc(
+    p: float, t: float, min_keep: int, seed: int, /
+) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
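Note: a sketch of wiring the new XTC sampler into a sampler chain, assuming the chain helpers (llama_sampler_chain_init / llama_sampler_chain_add / llama_sampler_chain_default_params) bound elsewhere in this file; parameter values are illustrative, not recommendations:

    import llama_cpp

    # Chain: with probability p, XTC cuts the dominant tokens above
    # threshold t (keeping at least min_keep), then dist samples from
    # whatever remains.
    chain = llama_cpp.llama_sampler_chain_init(
        llama_cpp.llama_sampler_chain_default_params()
    )
    llama_cpp.llama_sampler_chain_add(
        chain, llama_cpp.llama_sampler_init_xtc(0.5, 0.1, 1, 1234)
    )
    llama_cpp.llama_sampler_chain_add(
        chain, llama_cpp.llama_sampler_init_dist(1234)
    )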
@@ -3301,6 +3358,39 @@ def llama_sampler_init_logit_bias(
     ...
 
 
+# // this sampler is meant to be used for fill-in-the-middle infilling
+# // it's supposed to be used after top_k + top_p sampling
+# //
+# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+# // 2. combine probs of tokens that have the same prefix
+# //
+# // example:
+# //
+# // - before:
+# //   "hel":   0.5
+# //   "hell":  0.2
+# //   "hello": 0.1
+# //   "dummy": 0.1
+# //
+# // - after:
+# //   "hel":   0.8
+# //   "dummy": 0.1
+# //
+# // 3. discard non-EOG tokens with low prob
+# // 4. if no tokens are left -> pick EOT
+# //
+# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
+@ctypes_function(
+    "llama_sampler_init_infill",
+    [llama_model_p_ctypes],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p:
+    """This sampler is meant to be used for fill-in-the-middle infilling.
+    """
+    ...
+
+
 # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
 # LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
 @ctypes_function(
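Note: per the header comment, the infill sampler should sit after top-k/top-p narrowing. A hedged chain sketch, assuming the llama_sampler_init_top_k / llama_sampler_init_top_p initializers bound elsewhere in this file and an already-loaded `model` pointer:

    import llama_cpp

    # model: an already-loaded llama_model pointer (assumption).
    chain = llama_cpp.llama_sampler_chain_init(
        llama_cpp.llama_sampler_chain_default_params()
    )
    # Narrow the candidate set first, as the comment advises...
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_p(0.9, 1))
    # ...then merge shared prefixes / prefer EOG via the infill sampler,
    # and finally sample a token from the surviving candidates.
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_infill(model))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))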

vendor/llama.cpp — submodule commit reference updated (+1 −1)
