Commit 456a601

Merge branch 'main' into batch-processing

2 parents: e1cd61e + f015966
File tree

6 files changed: +86 -22

CHANGELOG.md: +6 -0

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.28]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
+- feat: Add ability to pass in penalize_nl param by @shankinson in #1068
+- fix: print_grammar to stderr by @turian in #1052
+
 ## [0.2.27]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a

llama_cpp/__init__.py: +1 -1

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.27"
+__version__ = "0.2.28"

llama_cpp/llama.py: +2 -0

@@ -518,6 +518,7 @@ def generate(
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         grammar: Optional[LlamaGrammar] = None,
@@ -578,6 +579,7 @@ def generate(
                 mirostat_eta=mirostat_eta,
                 logits_processor=logits_processor,
                 grammar=grammar,
+                penalize_nl=penalize_nl,
             )
             if stopping_criteria is not None and stopping_criteria(
                 self._input_ids, self._scores[-1, :]
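
As a quick illustration of the new parameter (a hedged sketch, not part of this diff; the model path and sampling values are placeholders), penalize_nl controls whether newline tokens are subject to the repetition penalty during low-level generation:

```python
# Hypothetical usage sketch for the new penalize_nl flag on Llama.generate()
# (assumes llama-cpp-python 0.2.28 and a local GGUF model; path is a placeholder).
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/model.gguf")
tokens = llm.tokenize(b"Q: Name three colors. A:")

# penalize_nl=False keeps newline tokens out of the repetition penalty,
# which can help preserve line breaks in structured output.
for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.8, penalize_nl=False):
    if token == llm.token_eos():
        break
    print(llm.detokenize([token]).decode("utf-8", errors="ignore"), end="", flush=True)
```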

llama_cpp/llama_cpp.py: +75 -19

@@ -104,7 +104,7 @@ def _load_shared_library(lib_base_name: str):
 # define LLAMA_MAX_RNG_STATE (64*1024)
 LLAMA_MAX_RNG_STATE = 64 * 1024
 
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+# define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 LLAMA_FILE_MAGIC_GGLA = 0x67676C61
 
 # define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
@@ -179,6 +179,7 @@ def _load_shared_library(lib_base_name: str):
 #     LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
 
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -222,11 +223,12 @@ def _load_shared_library(lib_base_name: str):
 # } llama_token_data;
 class llama_token_data(Structure):
     """Used to store token data
-
+
     Attributes:
         id (llama_token): token id
        logit (float): log-odds of the token
        p (float): probability of the token"""
+
     _fields_ = [
         ("id", llama_token),
         ("logit", c_float),
@@ -244,11 +246,12 @@ class llama_token_data(Structure):
 # } llama_token_data_array;
 class llama_token_data_array(Structure):
     """Used to sample tokens given logits
-
+
     Attributes:
         data (ctypes.Array[llama_token_data]): token data
         size (int): size of the array
         sorted (bool): whether the array is sorted"""
+
     _fields_ = [
         ("data", llama_token_data_p),
         ("size", c_size_t),
@@ -303,7 +306,8 @@ class llama_batch(Structure):
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
         pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence
-        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs"""
+        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs
+    """
 
     _fields_ = [
         ("n_tokens", c_int32),
@@ -318,6 +322,7 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]
 
+
 # enum llama_model_kv_override_type {
 #     LLAMA_KV_OVERRIDE_INT,
 #     LLAMA_KV_OVERRIDE_FLOAT,
@@ -327,6 +332,7 @@ class llama_batch(Structure):
 LLAMA_KV_OVERRIDE_FLOAT = 1
 LLAMA_KV_OVERRIDE_BOOL = 2
 
+
 # struct llama_model_kv_override {
 #     char key[128];
 #     enum llama_model_kv_override_type tag;
@@ -343,13 +349,15 @@ class llama_model_kv_override_value(CtypesUnion):
         ("bool_value", c_bool),
     ]
 
+
 class llama_model_kv_override(Structure):
     _fields_ = [
         ("key", ctypes.c_char * 128),
         ("tag", c_int),
         ("value", llama_model_kv_override_value),
     ]
 
+
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     int32_t main_gpu; // the GPU that is used for scratch and small tensors
@@ -365,14 +373,15 @@ class llama_model_kv_override(Structure):
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap; // use mmap if possible
 #     bool use_mlock; // force system to keep model in RAM
 # };
 class llama_model_params(Structure):
     """Parameters for llama_model
-
+
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
         main_gpu (int): the GPU that is used for scratch and small tensors
@@ -383,6 +392,7 @@ class llama_model_params(Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
+
     _fields_ = [
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
@@ -416,6 +426,7 @@ class llama_model_params(Structure):
 #     enum ggml_type type_k; // data type for K cache
 #     enum ggml_type type_v; // data type for V cache
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 #     bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
@@ -424,7 +435,7 @@ class llama_model_params(Structure):
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
-
+
     Attributes:
         seed (int): RNG seed, -1 for random
         n_ctx (int): text context, 0 = from model
@@ -444,7 +455,9 @@ class llama_context_params(Structure):
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
-        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+    """
+
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
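
The context parameters documented above (n_gpu_layers, offload_kqv, and friends) also surface as keyword arguments on the high-level wrapper; a minimal sketch, assuming the usual constructor names and a placeholder model path, not part of this diff:

```python
# Illustrative only: a few of the documented parameters set via the high-level
# wrapper (values and model path are placeholders).
from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/model.gguf",
    n_ctx=4096,        # text context, 0 = from model
    n_gpu_layers=35,   # number of layers to store in VRAM
    use_mmap=True,     # use mmap if possible
    offload_kqv=True,  # offload the KQV ops (including the KV cache) to GPU
)
```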
@@ -493,14 +506,16 @@ class llama_context_params(Structure):
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     """Parameters for llama_model_quantize
-
+
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
+        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+    """
+
     _fields_ = [
         ("nthread", c_int32),
         ("ftype", c_int),
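
For orientation, a hedged sketch of how the quantize parameters documented above are typically fed to the module-level binding (the file paths and target ftype are placeholders, not part of this diff):

```python
# Illustrative only: quantize a GGUF model via the low-level binding whose
# params struct is documented above (paths and ftype are placeholders).
import ctypes
import llama_cpp

qparams = llama_cpp.llama_model_quantize_default_params()
qparams.nthread = 0                                  # <=0: use hardware_concurrency()
qparams.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M  # quantize to this llama_ftype
ret = llama_cpp.llama_model_quantize(
    b"./models/7B/model-f16.gguf",
    b"./models/7B/model-q4_k_m.gguf",
    ctypes.byref(qparams),
)
print("quantize returned", ret)  # 0 is expected on success
```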
@@ -745,13 +760,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 _lib.llama_n_ctx.argtypes = [llama_context_p]
 _lib.llama_n_ctx.restype = c_uint32
 
+
 # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 def llama_n_batch(ctx: llama_context_p) -> int:
     return _lib.llama_n_batch(ctx)
 
+
 _lib.llama_n_batch.argtypes = [llama_context_p]
 _lib.llama_n_batch.restype = c_uint32
 
+
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 def llama_vocab_type(model: llama_model_p) -> int:
     return _lib.llama_vocab_type(model)
@@ -1080,7 +1098,7 @@ def llama_kv_cache_view_init(
 
 # // Free a KV cache view. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
+def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"):  # type: ignore
     """Free a KV cache view. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_free(view)
 
@@ -1091,7 +1109,7 @@ def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # typ
 
 # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
+def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"):  # type: ignore
     """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_update(ctx, view)
 
@@ -1251,6 +1269,40 @@ def llama_kv_cache_seq_shift(
 ]
 _lib.llama_kv_cache_seq_shift.restype = None
 
+
+# // Integer division of the positions by factor of `d > 1`
+# // If the KV cache is RoPEd, the KV data is updated accordingly
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
+# LLAMA_API void llama_kv_cache_seq_div(
+#     struct llama_context * ctx,
+#     llama_seq_id seq_id,
+#     llama_pos p0,
+#     llama_pos p1,
+#     int d);
+def llama_kv_cache_seq_div(
+    ctx: llama_context_p,
+    seq_id: Union[llama_seq_id, int],
+    p0: Union[llama_pos, int],
+    p1: Union[llama_pos, int],
+    d: Union[c_int, int],
+):
+    """Integer division of the positions by factor of `d > 1`
+    If the KV cache is RoPEd, the KV data is updated accordingly
+    p0 < 0 : [0, p1]
+    p1 < 0 : [p0, inf)"""
+    return _lib.llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d)
+
+
+_lib.llama_kv_cache_seq_div.argtypes = [
+    llama_context_p,
+    llama_seq_id,
+    llama_pos,
+    llama_pos,
+    c_int,
+]
+_lib.llama_kv_cache_seq_div.restype = None
+
 # //
 # // State / sessions
 # //
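
A minimal sketch of the newly bound llama_kv_cache_seq_div through the low-level API, assuming a placeholder model path and that tokens for sequence 0 have already been decoded (not part of this diff):

```python
# Illustrative only: divide the cached positions of sequence 0 in [0, 64)
# by a factor of 2 after decoding, e.g. as part of self-extend style context
# manipulation. Model path is a placeholder.
import llama_cpp

llama_cpp.llama_backend_init(numa=False)
model = llama_cpp.llama_load_model_from_file(
    b"./models/7B/model.gguf", llama_cpp.llama_model_default_params()
)
ctx = llama_cpp.llama_new_context_with_model(
    model, llama_cpp.llama_context_default_params()
)

# ... llama_decode() some tokens for seq_id 0 here ...

# If the KV cache is RoPEd, the cached K data is re-rotated accordingly.
llama_cpp.llama_kv_cache_seq_div(ctx, 0, 0, 64, 2)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()
```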
@@ -2063,10 +2115,11 @@ def llama_sample_temp(
     temp: Union[c_float, float],
 ):
     """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text."""
+        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    """
     return _lib.llama_sample_temp(ctx, candidates, temp)
 
 
@@ -2111,10 +2164,11 @@ def llama_sample_grammar(
     grammar,  # type: llama_grammar_p
 ):
     """Apply constraints from grammar
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        grammar: A grammar object containing the rules and constraints to apply to the generated text."""
+        grammar: A grammar object containing the rules and constraints to apply to the generated text.
+    """
     return _lib.llama_sample_grammar(ctx, candidates, grammar)
 
 
@@ -2148,13 +2202,14 @@ def llama_sample_token_mirostat(
     mu,  # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
 
 
@@ -2188,12 +2243,13 @@ def llama_sample_token_mirostat_v2(
     mu,  # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-
+
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
 
 
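To make the documented tau, eta, and mu parameters concrete, a hedged sketch of driving the Mirostat 2.0 sampler through these bindings (the helper name is illustrative; ctx, logits, and n_vocab are assumed to come from an already-decoded low-level context via llama_get_logits and llama_n_vocab):

```python
# Illustrative helper, not part of this diff: build a llama_token_data_array
# from raw logits and sample one token with Mirostat 2.0.
import ctypes
import llama_cpp

def sample_mirostat_v2(ctx, logits, n_vocab, tau=5.0, eta=0.1):
    # Fill one llama_token_data entry per vocabulary token.
    candidates = (llama_cpp.llama_token_data * n_vocab)()
    for token_id in range(n_vocab):
        candidates[token_id].id = token_id
        candidates[token_id].logit = logits[token_id]
        candidates[token_id].p = 0.0
    candidates_p = llama_cpp.llama_token_data_array(
        data=ctypes.cast(candidates, llama_cpp.llama_token_data_p),
        size=n_vocab,
        sorted=False,
    )
    mu = ctypes.c_float(2.0 * tau)  # mu starts at 2 * tau and is updated in place
    return llama_cpp.llama_sample_token_mirostat_v2(
        ctx, ctypes.byref(candidates_p), tau, eta, ctypes.byref(mu)
    )
```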

llama_cpp/llama_grammar.py: +1 -1

@@ -72,7 +72,7 @@ def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar":
         )
         if verbose:
             print(f"{cls.from_string.__name__} grammar:", file=sys.stderr)
-            print_grammar(sys.stdout, parsed_grammar)
+            print_grammar(sys.stderr, parsed_grammar)
         print(file=sys.stderr)
         return cls(parsed_grammar)
 
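With this fix the verbose grammar dump goes to stderr rather than stdout, so generated text can be piped cleanly; a small usage sketch (the GBNF string is illustrative, not from this diff):

```python
# Illustrative only: the diagnostic print_grammar dump now goes to stderr,
# so piping stdout no longer mixes in grammar debug output.
from llama_cpp import LlamaGrammar

grammar = LlamaGrammar.from_string('root ::= "yes" | "no"', verbose=True)
```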

vendor/llama.cpp: submodule updated
