Commit e325a83
feat: Update llama.cpp
1 parent: c89be28
File tree: 2 files changed, +57 -9 lines changed

‎llama_cpp/llama_cpp.py

56 additions & 8 deletions
@@ -668,30 +668,36 @@ class llama_context_params(ctypes.Structure):
 
 # // model quantization parameters
 # typedef struct llama_model_quantize_params {
-#     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-#     enum llama_ftype ftype;              // quantize to this llama_ftype
-#     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
-#     bool quantize_output_tensor;         // quantize output.weight
-#     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-#     bool pure;                           // quantize all tensors to the default type
-#     void * imatrix;                      // pointer to importance matrix data
+#     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+#     enum llama_ftype ftype;              // quantize to this llama_ftype
+#     enum ggml_type output_tensor_type;   // output tensor type
+#     enum ggml_type token_embedding_type; // token embeddings tensor type
+#     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+#     bool quantize_output_tensor;         // quantize output.weight
+#     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+#     bool pure;                           // quantize all tensors to the default type
+#     void * imatrix;                      // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
 
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
+        output_tensor_type (int): output tensor type
+        token_embedding_type (int): token embeddings tensor type
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
-        imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """
 
     _fields_ = [
         ("nthread", ctypes.c_int32),
         ("ftype", ctypes.c_int),
+        ("output_tensor_type", ctypes.c_int),
+        ("token_embedding_type", ctypes.c_int),
         ("allow_requantize", ctypes.c_bool),
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
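For orientation, the two new fields plug straight into the existing quantization entry point. The snippet below is a minimal sketch, not part of this commit: it assumes the module also exposes llama_model_quantize, llama_model_quantize_default_params, and the LLAMA_FTYPE_* / GGML_TYPE_* constants (otherwise the raw enum integers can be used), and the file paths are placeholders.

# Sketch: quantize a model while pinning the new per-tensor types.
import ctypes
import llama_cpp

params = llama_cpp.llama_model_quantize_default_params()
params.nthread = 8
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M       # overall quantization target
params.output_tensor_type = llama_cpp.GGML_TYPE_Q8_0     # new field: keep output.weight at Q8_0
params.token_embedding_type = llama_cpp.GGML_TYPE_Q8_0   # new field: keep token embeddings at Q8_0

ret = llama_cpp.llama_model_quantize(
    b"./models/model-f16.gguf",      # placeholder input path
    b"./models/model-q4_k_m.gguf",   # placeholder output path
    ctypes.byref(params),
)
if ret != 0:
    raise RuntimeError(f"llama_model_quantize failed with code {ret}")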
@@ -2743,6 +2749,48 @@ def llama_beam_search(
 ): ...
 
 
+# /// @details Build a split GGUF final path for this chunk.
+# ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+# //  Returns the split_path length.
+# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_path",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_path(
+    split_path: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    path_prefix: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Build a split GGUF final path for this chunk."""
+    ...
+
+
+# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+# ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+# //  Returns the split_prefix length.
+# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_prefix",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_prefix(
+    split_prefix: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    split_path: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Extract the path prefix from the split_path if and only if the split_no and split_count match."""
+    ...
+
+
 # Performance information
 
 
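As a usage sketch (not part of this commit), the two new helpers can be round-tripped from Python. Because split_path and split_prefix are output buffers on the C side, a ctypes.create_string_buffer is passed where the annotation says bytes; the paths below are the same placeholders used in the header comments.

# Sketch: build the path of shard 2 of 4, then recover the prefix from it.
import ctypes
import llama_cpp

split_path = ctypes.create_string_buffer(1024)
n = llama_cpp.llama_split_path(
    split_path, ctypes.sizeof(split_path), b"/models/ggml-model-q4_0", 2, 4
)
print(n, split_path.value.decode())    # expects .../ggml-model-q4_0-00002-of-00004.gguf

split_prefix = ctypes.create_string_buffer(256)
n = llama_cpp.llama_split_prefix(
    split_prefix, ctypes.sizeof(split_prefix), split_path.value, 2, 4
)
print(n, split_prefix.value.decode())  # expects /models/ggml-model-q4_0 (only when split_no/split_count match)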
‎vendor/llama.cpp

1 addition & 1 deletion (submodule commit updated)
