Commit 5502ac8

Update llama.cpp
1 parent 359ae73 commit 5502ac8

2 files changed: +36 -9 lines

llama_cpp/llama_cpp.py: 35 additions & 8 deletions
@@ -229,6 +229,7 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_SPLIT_LAYER = 1
 LLAMA_SPLIT_ROW = 2

+
 # typedef struct llama_token_data {
 #     llama_token id; // token id
 #     float logit;    // log-odds of the token
@@ -395,6 +396,7 @@ class llama_model_kv_override(Structure):
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;

+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only;  // only load the vocabulary, no weights
 #     bool use_mmap;    // use mmap if possible
@@ -407,7 +409,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
-        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -1960,14 +1962,39 @@ def llama_sample_repetition_penalties(


 # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-# LLAMA_API void llama_sample_classifier_free_guidance(
-#           struct llama_context * ctx,
+# /// @param logits Logits extracted from the original generation context.
+# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# LLAMA_API void llama_sample_apply_guidance(
+#           struct llama_context * ctx,
+#           float * logits,
+#           float * logits_guidance,
+#           float   scale);
+def llama_sample_apply_guidance(
+    ctx: llama_context_p,
+    logits,  # type: _Pointer[c_float]
+    logits_guidance,  # type: _Pointer[c_float]
+    scale: Union[c_float, float],
+):
+    """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
+    return _lib.llama_sample_apply_guidance(ctx, logits, logits_guidance, scale)
+
+
+_lib.llama_sample_apply_guidance.argtypes = [
+    llama_context_p,
+    c_float_p,
+    c_float_p,
+    c_float,
+]
+_lib.llama_sample_apply_guidance.restype = None
+
+
+# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
+#           struct llama_context * ctx,
 #           llama_token_data_array * candidates,
-#           struct llama_context * guidance_ctx,
-#           float   scale);
+#           struct llama_context * guidance_ctx,
+#           float   scale),
+#           "use llama_sample_apply_guidance() instead");
 def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
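For downstream code, the migration path is from the deprecated candidates/guidance_ctx call (kept just above) to the new logits-based one. A minimal sketch of driving the new binding, assuming ctx and guidance_ctx are two already-evaluated contexts over the same model, with the negative prompt only at the start of guidance_ctx, and using this module's llama_get_logits; the scale of 2.0 is illustrative:

import llama_cpp.llama_cpp as llama_cpp

# Assumed to already exist: `ctx` (main generation context) and
# `guidance_ctx` (same model; negative prompt first, then the same
# generated/user tokens as `ctx`, as the header comment requires).
logits = llama_cpp.llama_get_logits(ctx)                   # _Pointer[c_float]
logits_guidance = llama_cpp.llama_get_logits(guidance_ctx)

# Modifies `logits` in place; scale 1.0 means no guidance, higher is stronger.
llama_cpp.llama_sample_apply_guidance(ctx, logits, logits_guidance, 2.0)

# Sampling (e.g. building a llama_token_data_array and calling
# llama_sample_token) then proceeds on the guided logits.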

vendor/llama.cpp: 1 addition & 1 deletion

Submodule update: the pinned llama.cpp commit is bumped, accounting for the remaining +1 -1 in the totals above.