From c8a5504eb1b7ff17cc13f53ac63eab1be46d801a Mon Sep 17 00:00:00 2001
From: ddh0
Date: Thu, 13 Jun 2024 10:31:11 -0500
Subject: [PATCH 1/2] reapply changes after sync with main branch

---
 llama_cpp/_internals.py | 216 +++++++++++++++++++++++++++++++++++++++-
 llama_cpp/llama.py      |  10 +-
 2 files changed, 224 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index ee990d474..ad91673cc 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import os
+import sys
+import struct
 import ctypes
 
 from typing import (
@@ -10,6 +12,8 @@
 )
 from dataclasses import dataclass, field
 from contextlib import ExitStack
+from io import BufferedReader
+from enum import IntEnum
 
 import numpy as np
 import numpy.typing as npt
@@ -226,7 +230,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         )
 
     # Extra
-    def metadata(self) -> Dict[str, str]:
+    def _metadata_no_arrays(self) -> Dict[str, str]:
         assert self.model is not None
         metadata: Dict[str, str] = {}
         buffer_size = 1024
@@ -250,6 +254,12 @@
             metadata[key] = value
         return metadata
 
+    def metadata(self) -> Dict[str, Union[str, int, float, bool, list]]:
+        assert self.model is not None
+        # Uncomment the next line to use the old method
+        # return self._metadata_no_arrays()
+        return QuickGGUFReader.load_metadata(self.path_model)
+
     @staticmethod
     def default_params():
         """Get the default llama_model_params."""
@@ -833,3 +843,207 @@ def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
         self.prev.append(id)
+
+class QuickGGUFReader:
+    """
+    All logic in this class is based on the GGUF format specification, which
+    can be found here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+    """
+    # NOTE: Officially, there is no way to determine whether a GGUF file is
+    #       little or big endian. The format specification directs us to
+    #       assume that a file is little endian unless additional information
+    #       is provided.
+    #
+    #       Additionally, GGUF files cannot be used on hosts with the
+    #       opposite endianness, and at this point in the code the model is
+    #       already loaded. Therefore, we can assume that the endianness of
+    #       the file matches the endianness of the host.
+
+    # the GGUF format versions that this class supports
+    SUPPORTED_GGUF_VERSIONS = [2, 3]
+
+    # GGUF only supports execution on little or big endian machines
+    if sys.byteorder not in ['little', 'big']:
+        raise ValueError(
+            "host is not little or big endian - GGUF is unsupported"
+        )
+
+    # Occasionally check to ensure these values are consistent with
+    # the latest values in llama.cpp/gguf-py/gguf/constants.py
+    class GGUFValueType(IntEnum):
+        UINT8 = 0
+        INT8 = 1
+        UINT16 = 2
+        INT16 = 3
+        UINT32 = 4
+        INT32 = 5
+        FLOAT32 = 6
+        BOOL = 7
+        STRING = 8
+        ARRAY = 9
+        UINT64 = 10
+        INT64 = 11
+        FLOAT64 = 12
+
+    # arguments for struct.unpack() based on gguf value type
+    value_packing: dict = {
+        GGUFValueType.UINT8: "=B",
+        GGUFValueType.INT8: "=b",
+        GGUFValueType.UINT16: "=H",
+        GGUFValueType.INT16: "=h",
+        GGUFValueType.UINT32: "=I",
+        GGUFValueType.INT32: "=i",
+        GGUFValueType.FLOAT32: "=f",
+        GGUFValueType.UINT64: "=Q",
+        GGUFValueType.INT64: "=q",
+        GGUFValueType.FLOAT64: "=d",
+        GGUFValueType.BOOL: "?"
+    }
+
+    # length in bytes for each gguf value type
+    value_lengths: dict = {
+        GGUFValueType.UINT8: 1,
+        GGUFValueType.INT8: 1,
+        GGUFValueType.UINT16: 2,
+        GGUFValueType.INT16: 2,
+        GGUFValueType.UINT32: 4,
+        GGUFValueType.INT32: 4,
+        GGUFValueType.FLOAT32: 4,
+        GGUFValueType.UINT64: 8,
+        GGUFValueType.INT64: 8,
+        GGUFValueType.FLOAT64: 8,
+        GGUFValueType.BOOL: 1
+    }
+
+    @staticmethod
+    def unpack(value_type: GGUFValueType, file: BufferedReader):
+        return struct.unpack(
+            QuickGGUFReader.value_packing.get(value_type),
+            file.read(QuickGGUFReader.value_lengths.get(value_type))
+        )[0]
+
+    @staticmethod
+    def get_single(
+        value_type: GGUFValueType,
+        file: BufferedReader
+    ) -> Union[str, int, float, bool]:
+        """Read a single value from an open file"""
+        if value_type == QuickGGUFReader.GGUFValueType.STRING:
+            string_length = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT64,
+                file=file
+            )
+            value = file.read(string_length)
+            # officially, strings that cannot be decoded into utf-8 are invalid
+            value = value.decode("utf-8")
+        else:
+            value = QuickGGUFReader.unpack(value_type, file=file)
+        return value
+
+    @staticmethod
+    def load_metadata(
+        fn: Union[os.PathLike[str], str]
+    ) -> dict[str, Union[str, int, float, bool, list]]:
+        """
+        Given a path to a GGUF file, peek at its header for metadata
+
+        Return a dictionary where all keys are strings, and values can be
+        strings, ints, floats, bools, or lists
+        """
+
+        metadata: dict[str, Union[str, int, float, bool, list]] = {}
+        with open(fn, "rb") as file:
+            magic = file.read(4)
+
+            if magic != b"GGUF":
+                raise ValueError(
+                    "your model file is not a valid GGUF file "
+                    f"(magic number mismatch, got {magic}, "
+                    "expected b'GGUF')"
+                )
+
+            version = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT32,
+                file=file
+            )
+
+            if version not in QuickGGUFReader.SUPPORTED_GGUF_VERSIONS:
+                raise ValueError(
+                    f"your model file reports GGUF version {version}, but "
+                    f"only versions {QuickGGUFReader.SUPPORTED_GGUF_VERSIONS} "
+                    "are supported. Re-convert your model or download a newer "
+                    "version"
+                )
+
+            tensor_count = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT64,
+                file=file
+            )
+
+            if version == 3:
+                metadata_kv_count = QuickGGUFReader.unpack(
+                    QuickGGUFReader.GGUFValueType.UINT64,
+                    file=file
+                )
+            elif version == 2:
+                metadata_kv_count = QuickGGUFReader.unpack(
+                    QuickGGUFReader.GGUFValueType.UINT32,
+                    file=file
+                )
+
+            for _ in range(metadata_kv_count):
+                if version == 3:
+                    key_length = QuickGGUFReader.unpack(
+                        QuickGGUFReader.GGUFValueType.UINT64,
+                        file=file
+                    )
+                elif version == 2:
+                    key_length = 0
+                    while key_length == 0:
+                        # read until next key is found
+                        key_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                    file.read(4) # 4 byte offset for GGUFv2
+                key = file.read(key_length)
+                value_type = QuickGGUFReader.GGUFValueType(
+                    QuickGGUFReader.unpack(
+                        QuickGGUFReader.GGUFValueType.UINT32,
+                        file=file
+                    )
+                )
+                if value_type == QuickGGUFReader.GGUFValueType.ARRAY:
+                    array_value_type = QuickGGUFReader.GGUFValueType(
+                        QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                    )
+                    # array_length is the number of items in the array
+                    if version == 3:
+                        array_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT64,
+                            file=file
+                        )
+                    elif version == 2:
+                        array_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                        file.read(4) # 4 byte offset for GGUFv2
+                    array = [
+                        QuickGGUFReader.get_single(
+                            array_value_type,
+                            file=file
+                        ) for _ in range(array_length)
+                    ]
+                    metadata[key.decode()] = array
+                else:
+                    value = QuickGGUFReader.get_single(
+                        value_type,
+                        file=file
+                    )
+                    metadata[key.decode()] = value
+
+        return metadata
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 459b29f92..1cf3f0ddf 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -428,7 +428,15 @@ def __init__(
             print(f"Failed to load metadata: {e}", file=sys.stderr)
 
         if self.verbose:
-            print(f"Model metadata: {self.metadata}", file=sys.stderr)
+            print("Model metadata:", file=sys.stderr)
+            for k, v in self.metadata.items():
+                # only calculate repr() once as it may be slow for large arrays
+                repr_v = repr(v)
+                if len(repr_v) > 63:
+                    # truncate long values
+                    print(f" {k}: {repr_v[:60]}...", file=sys.stderr)
+                else:
+                    print(f" {k}: {repr_v}", file=sys.stderr)
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()

From a2b89cb22104441a929a661252df4094ded24412 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Fri, 12 Jul 2024 00:06:44 -0500
Subject: [PATCH 2/2] change repeat_penalty to 1.0 to match llama.cpp defaults

---
 llama_cpp/_internals.py | 218 +---------------------------------------
 llama_cpp/llama.py      |  24 ++---
 2 files changed, 10 insertions(+), 232 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 8c97f7dbc..dcd4e17ff 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -1,8 +1,6 @@
 from __future__ import annotations
 
 import os
-import sys
-import struct
 import ctypes
 
 from typing import (
@@ -13,8 +11,6 @@
 )
 from dataclasses import dataclass, field
 from contextlib import ExitStack
-from io import BufferedReader
-from enum import IntEnum
 
 import numpy as np
 import numpy.typing as npt
@@ -238,7 +234,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         )
 
     # Extra
-    def _metadata_no_arrays(self) -> Dict[str, str]:
+    def metadata(self) -> Dict[str, str]:
         assert self.model is not None
         metadata: Dict[str, str] = {}
         buffer_size = 1024
@@ -270,12 +266,6 @@ def _metadata_no_arrays(self) -> Dict[str, str]:
             metadata[key] = value
         return metadata
 
-    def metadata(self) -> Dict[str, Union[str, int, float, bool, list]]:
-        assert self.model is not None
-        # Uncomment the next line to use the old method
-        # return self._metadata_no_arrays()
-        return QuickGGUFReader.load_metadata(self.path_model)
-
     @staticmethod
     def default_params():
         """Get the default llama_model_params."""
@@ -731,7 +721,7 @@ class _LlamaSamplingParams:
     typical_p: float = 1.00
     temp: float = 0.80
     penalty_last_n: int = 64
-    penalty_repeat: float = 1.10
+    penalty_repeat: float = 1.0
     penalty_freq: float = 0.00
     penalty_present: float = 0.00
     mirostat: int = 0
@@ -876,207 +866,3 @@ def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
         self.prev.append(id)
-
-class QuickGGUFReader:
-    """
-    All logic in this class is based on the GGUF format specification, which
-    can be found here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
-    """
-    # NOTE: Officially, there is no way to determine whether a GGUF file is
-    #       little or big endian. The format specification directs us to
-    #       assume that a file is little endian unless additional information
-    #       is provided.
-    #
-    #       Additionally, GGUF files cannot be used on hosts with the
-    #       opposite endianness, and at this point in the code the model is
-    #       already loaded. Therefore, we can assume that the endianness of
-    #       the file matches the endianness of the host.
-
-    # the GGUF format versions that this class supports
-    SUPPORTED_GGUF_VERSIONS = [2, 3]
-
-    # GGUF only supports execution on little or big endian machines
-    if sys.byteorder not in ['little', 'big']:
-        raise ValueError(
-            "host is not little or big endian - GGUF is unsupported"
-        )
-
-    # Occasionally check to ensure these values are consistent with
-    # the latest values in llama.cpp/gguf-py/gguf/constants.py
-    class GGUFValueType(IntEnum):
-        UINT8 = 0
-        INT8 = 1
-        UINT16 = 2
-        INT16 = 3
-        UINT32 = 4
-        INT32 = 5
-        FLOAT32 = 6
-        BOOL = 7
-        STRING = 8
-        ARRAY = 9
-        UINT64 = 10
-        INT64 = 11
-        FLOAT64 = 12
-
-    # arguments for struct.unpack() based on gguf value type
-    value_packing: dict = {
-        GGUFValueType.UINT8: "=B",
-        GGUFValueType.INT8: "=b",
-        GGUFValueType.UINT16: "=H",
-        GGUFValueType.INT16: "=h",
-        GGUFValueType.UINT32: "=I",
-        GGUFValueType.INT32: "=i",
-        GGUFValueType.FLOAT32: "=f",
-        GGUFValueType.UINT64: "=Q",
-        GGUFValueType.INT64: "=q",
-        GGUFValueType.FLOAT64: "=d",
-        GGUFValueType.BOOL: "?"
-    }
-
-    # length in bytes for each gguf value type
-    value_lengths: dict = {
-        GGUFValueType.UINT8: 1,
-        GGUFValueType.INT8: 1,
-        GGUFValueType.UINT16: 2,
-        GGUFValueType.INT16: 2,
-        GGUFValueType.UINT32: 4,
-        GGUFValueType.INT32: 4,
-        GGUFValueType.FLOAT32: 4,
-        GGUFValueType.UINT64: 8,
-        GGUFValueType.INT64: 8,
-        GGUFValueType.FLOAT64: 8,
-        GGUFValueType.BOOL: 1
-    }
-
-    @staticmethod
-    def unpack(value_type: GGUFValueType, file: BufferedReader):
-        return struct.unpack(
-            QuickGGUFReader.value_packing.get(value_type),
-            file.read(QuickGGUFReader.value_lengths.get(value_type))
-        )[0]
-
-    @staticmethod
-    def get_single(
-        value_type: GGUFValueType,
-        file: BufferedReader
-    ) -> Union[str, int, float, bool]:
-        """Read a single value from an open file"""
-        if value_type == QuickGGUFReader.GGUFValueType.STRING:
-            string_length = QuickGGUFReader.unpack(
-                QuickGGUFReader.GGUFValueType.UINT64,
-                file=file
-            )
-            value = file.read(string_length)
-            # officially, strings that cannot be decoded into utf-8 are invalid
-            value = value.decode("utf-8")
-        else:
-            value = QuickGGUFReader.unpack(value_type, file=file)
-        return value
-
-    @staticmethod
-    def load_metadata(
-        fn: Union[os.PathLike[str], str]
-    ) -> dict[str, Union[str, int, float, bool, list]]:
-        """
-        Given a path to a GGUF file, peek at its header for metadata
-
-        Return a dictionary where all keys are strings, and values can be
-        strings, ints, floats, bools, or lists
-        """
-
-        metadata: dict[str, Union[str, int, float, bool, list]] = {}
-        with open(fn, "rb") as file:
-            magic = file.read(4)
-
-            if magic != b"GGUF":
-                raise ValueError(
-                    "your model file is not a valid GGUF file "
-                    f"(magic number mismatch, got {magic}, "
-                    "expected b'GGUF')"
-                )
-
-            version = QuickGGUFReader.unpack(
-                QuickGGUFReader.GGUFValueType.UINT32,
-                file=file
-            )
-
-            if version not in QuickGGUFReader.SUPPORTED_GGUF_VERSIONS:
-                raise ValueError(
-                    f"your model file reports GGUF version {version}, but "
-                    f"only versions {QuickGGUFReader.SUPPORTED_GGUF_VERSIONS} "
-                    "are supported. Re-convert your model or download a newer "
-                    "version"
-                )
-
-            tensor_count = QuickGGUFReader.unpack(
-                QuickGGUFReader.GGUFValueType.UINT64,
-                file=file
-            )
-
-            if version == 3:
-                metadata_kv_count = QuickGGUFReader.unpack(
-                    QuickGGUFReader.GGUFValueType.UINT64,
-                    file=file
-                )
-            elif version == 2:
-                metadata_kv_count = QuickGGUFReader.unpack(
-                    QuickGGUFReader.GGUFValueType.UINT32,
-                    file=file
-                )
-
-            for _ in range(metadata_kv_count):
-                if version == 3:
-                    key_length = QuickGGUFReader.unpack(
-                        QuickGGUFReader.GGUFValueType.UINT64,
-                        file=file
-                    )
-                elif version == 2:
-                    key_length = 0
-                    while key_length == 0:
-                        # read until next key is found
-                        key_length = QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT32,
-                            file=file
-                        )
-                    file.read(4) # 4 byte offset for GGUFv2
-                key = file.read(key_length)
-                value_type = QuickGGUFReader.GGUFValueType(
-                    QuickGGUFReader.unpack(
-                        QuickGGUFReader.GGUFValueType.UINT32,
-                        file=file
-                    )
-                )
-                if value_type == QuickGGUFReader.GGUFValueType.ARRAY:
-                    array_value_type = QuickGGUFReader.GGUFValueType(
-                        QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT32,
-                            file=file
-                        )
-                    )
-                    # array_length is the number of items in the array
-                    if version == 3:
-                        array_length = QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT64,
-                            file=file
-                        )
-                    elif version == 2:
-                        array_length = QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT32,
-                            file=file
-                        )
-                        file.read(4) # 4 byte offset for GGUFv2
-                    array = [
-                        QuickGGUFReader.get_single(
-                            array_value_type,
-                            file=file
-                        ) for _ in range(array_length)
-                    ]
-                    metadata[key.decode()] = array
-                else:
-                    value = QuickGGUFReader.get_single(
-                        value_type,
-                        file=file
-                    )
-                    metadata[key.decode()] = value
-
-        return metadata
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index aceec945f..ded3cf912 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -456,15 +456,7 @@ def __init__(
             print(f"Failed to load metadata: {e}", file=sys.stderr)
 
         if self.verbose:
-            print("Model metadata:", file=sys.stderr)
-            for k, v in self.metadata.items():
-                # only calculate repr() once as it may be slow for large arrays
-                repr_v = repr(v)
-                if len(repr_v) > 63:
-                    # truncate long values
-                    print(f" {k}: {repr_v[:60]}...", file=sys.stderr)
-                else:
-                    print(f" {k}: {repr_v}", file=sys.stderr)
+            print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
@@ -657,7 +649,7 @@ def sample(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         temp: float = 0.80,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         tfs_z: float = 1.0,
@@ -732,7 +724,7 @@ def generate(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         temp: float = 0.80,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         reset: bool = True,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
@@ -750,7 +742,7 @@ def generate(
         Examples:
             >>> llama = Llama("models/ggml-7b.bin")
             >>> tokens = llama.tokenize(b"Hello, world!")
-            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1):
+            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
             ...     print(llama.detokenize([token]))
 
         Args:
@@ -1019,7 +1011,7 @@ def _create_completion(
         stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         top_k: int = 40,
         stream: bool = False,
         seed: Optional[int] = None,
@@ -1638,7 +1630,7 @@ def create_completion(
         stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         top_k: int = 40,
         stream: bool = False,
         seed: Optional[int] = None,
@@ -1735,7 +1727,7 @@ def __call__(
         stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         top_k: int = 40,
         stream: bool = False,
         seed: Optional[int] = None,
@@ -1832,7 +1824,7 @@ def create_chat_completion(
         max_tokens: Optional[int] = None,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
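
---

Note for reviewers: the header walk in QuickGGUFReader.load_metadata is easy to
sanity-check against a local model without loading llama.cpp at all. The sketch
below is illustrative only -- the file path is a placeholder, not part of the
patch -- and it mirrors this patch's assumptions: host and file endianness
agree (hence the "=" struct formats), and the GGUFv2 branch reads the metadata
KV count the same way this reader does (as a uint32).

    import struct

    # placeholder path; substitute any local GGUF file to try this
    GGUF_PATH = "models/example.gguf"

    with open(GGUF_PATH, "rb") as f:
        magic = f.read(4)
        if magic != b"GGUF":
            raise ValueError(f"not a GGUF file (got magic {magic!r})")
        # "=I"/"=Q" match the value_packing table above: standard-size
        # uint32/uint64 in the host's byte order
        version = struct.unpack("=I", f.read(4))[0]
        tensor_count = struct.unpack("=Q", f.read(8))[0]
        if version == 3:
            kv_count = struct.unpack("=Q", f.read(8))[0]  # uint64 in GGUFv3
        elif version == 2:
            kv_count = struct.unpack("=I", f.read(4))[0]  # uint32, as this reader treats GGUFv2
        else:
            raise ValueError(f"unsupported GGUF version {version}")
        print(f"GGUF v{version}: {tensor_count} tensors, "
              f"{kv_count} metadata key/value pairs")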