From c8a5504eb1b7ff17cc13f53ac63eab1be46d801a Mon Sep 17 00:00:00 2001
From: ddh0
Date: Thu, 13 Jun 2024 10:31:11 -0500
Subject: [PATCH 1/2] reapply changes after sync with main branch

---
 llama_cpp/_internals.py | 216 +++++++++++++++++++++++++++++++++++++++-
 llama_cpp/llama.py      |  10 +-
 2 files changed, 224 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index ee990d474..ad91673cc 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import os
+import sys
+import struct
 import ctypes
 
 from typing import (
@@ -10,6 +12,8 @@
 )
 from dataclasses import dataclass, field
 from contextlib import ExitStack
+from io import BufferedReader
+from enum import IntEnum
 
 import numpy as np
 import numpy.typing as npt
@@ -226,7 +230,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         )
 
     # Extra
-    def metadata(self) -> Dict[str, str]:
+    def _metadata_no_arrays(self) -> Dict[str, str]:
         assert self.model is not None
         metadata: Dict[str, str] = {}
         buffer_size = 1024
@@ -250,6 +254,12 @@
             metadata[key] = value
         return metadata
 
+    def metadata(self) -> Dict[str, Union[str, int, float, bool, list]]:
+        assert self.model is not None
+        # Uncomment the next line to use the old method
+        # return self._metadata_no_arrays()
+        return QuickGGUFReader.load_metadata(self.path_model)
+
     @staticmethod
     def default_params():
         """Get the default llama_model_params."""
@@ -833,3 +843,207 @@ def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
         self.prev.append(id)
+
+class QuickGGUFReader:
+    """
+    All logic in this class is based on the GGUF format specification, which
+    can be found here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+    """
+    # NOTE: Officially, there is no way to determine whether a GGUF file is
+    #       little or big endian. The format specification directs us to
+    #       assume that a file is little endian unless additional information
+    #       is provided.
+    #
+    #       Additionally, GGUF files cannot be used on hosts with the
+    #       opposite endianness, and at this point in the code the model is
+    #       already loaded. Therefore, we can assume that the endianness of
+    #       the file matches the endianness of the host.
+
+    # the GGUF format versions that this class supports
+    SUPPORTED_GGUF_VERSIONS = [2, 3]
+
+    # GGUF only supports execution on little or big endian machines
+    if sys.byteorder not in ['little', 'big']:
+        raise ValueError(
+            "host is not little or big endian - GGUF is unsupported"
+        )
+
+    # Occasionally check to ensure these values are consistent with
+    # the latest values in llama.cpp/gguf-py/gguf/constants.py
+    class GGUFValueType(IntEnum):
+        UINT8 = 0
+        INT8 = 1
+        UINT16 = 2
+        INT16 = 3
+        UINT32 = 4
+        INT32 = 5
+        FLOAT32 = 6
+        BOOL = 7
+        STRING = 8
+        ARRAY = 9
+        UINT64 = 10
+        INT64 = 11
+        FLOAT64 = 12
+
+    # arguments for struct.unpack() based on gguf value type
+    value_packing: dict = {
+        GGUFValueType.UINT8: "=B",
+        GGUFValueType.INT8: "=b",
+        GGUFValueType.UINT16: "=H",
+        GGUFValueType.INT16: "=h",
+        GGUFValueType.UINT32: "=I",
+        GGUFValueType.INT32: "=i",
+        GGUFValueType.FLOAT32: "=f",
+        GGUFValueType.UINT64: "=Q",
+        GGUFValueType.INT64: "=q",
+        GGUFValueType.FLOAT64: "=d",
+        GGUFValueType.BOOL: "?"
+    }
+
+    # length in bytes for each gguf value type
+    value_lengths: dict = {
+        GGUFValueType.UINT8: 1,
+        GGUFValueType.INT8: 1,
+        GGUFValueType.UINT16: 2,
+        GGUFValueType.INT16: 2,
+        GGUFValueType.UINT32: 4,
+        GGUFValueType.INT32: 4,
+        GGUFValueType.FLOAT32: 4,
+        GGUFValueType.UINT64: 8,
+        GGUFValueType.INT64: 8,
+        GGUFValueType.FLOAT64: 8,
+        GGUFValueType.BOOL: 1
+    }
+
+    @staticmethod
+    def unpack(value_type: GGUFValueType, file: BufferedReader):
+        return struct.unpack(
+            QuickGGUFReader.value_packing.get(value_type),
+            file.read(QuickGGUFReader.value_lengths.get(value_type))
+        )[0]
+
+    @staticmethod
+    def get_single(
+        value_type: GGUFValueType,
+        file: BufferedReader
+    ) -> Union[str, int, float, bool]:
+        """Read a single value from an open file"""
+        if value_type == QuickGGUFReader.GGUFValueType.STRING:
+            string_length = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT64,
+                file=file
+            )
+            value = file.read(string_length)
+            # officially, strings that cannot be decoded into utf-8 are invalid
+            value = value.decode("utf-8")
+        else:
+            value = QuickGGUFReader.unpack(value_type, file=file)
+        return value
+
+    @staticmethod
+    def load_metadata(
+        fn: Union[os.PathLike[str], str]
+    ) -> dict[str, Union[str, int, float, bool, list]]:
+        """
+        Given a path to a GGUF file, peek at its header for metadata
+
+        Return a dictionary where all keys are strings, and values can be
+        strings, ints, floats, bools, or lists
+        """
+
+        metadata: dict[str, Union[str, int, float, bool, list]] = {}
+        with open(fn, "rb") as file:
+            magic = file.read(4)
+
+            if magic != b"GGUF":
+                raise ValueError(
+                    "your model file is not a valid GGUF file "
+                    f"(magic number mismatch, got {magic}, "
+                    "expected b'GGUF')"
+                )
+
+            version = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT32,
+                file=file
+            )
+
+            if version not in QuickGGUFReader.SUPPORTED_GGUF_VERSIONS:
+                raise ValueError(
+                    f"your model file reports GGUF version {version}, but "
+                    f"only versions {QuickGGUFReader.SUPPORTED_GGUF_VERSIONS} "
+                    "are supported. Re-convert your model or download a newer "
+                    "version"
+                )
+
+            tensor_count = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT64,
+                file=file
+            )
+
+            if version == 3:
+                metadata_kv_count = QuickGGUFReader.unpack(
+                    QuickGGUFReader.GGUFValueType.UINT64,
+                    file=file
+                )
+            elif version == 2:
+                metadata_kv_count = QuickGGUFReader.unpack(
+                    QuickGGUFReader.GGUFValueType.UINT32,
+                    file=file
+                )
+
+            for _ in range(metadata_kv_count):
+                if version == 3:
+                    key_length = QuickGGUFReader.unpack(
+                        QuickGGUFReader.GGUFValueType.UINT64,
+                        file=file
+                    )
+                elif version == 2:
+                    key_length = 0
+                    while key_length == 0:
+                        # read until next key is found
+                        key_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                    file.read(4) # 4 byte offset for GGUFv2
+                key = file.read(key_length)
+                value_type = QuickGGUFReader.GGUFValueType(
+                    QuickGGUFReader.unpack(
+                        QuickGGUFReader.GGUFValueType.UINT32,
+                        file=file
+                    )
+                )
+                if value_type == QuickGGUFReader.GGUFValueType.ARRAY:
+                    array_value_type = QuickGGUFReader.GGUFValueType(
+                        QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                    )
+                    # array_length is the number of items in the array
+                    if version == 3:
+                        array_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT64,
+                            file=file
+                        )
+                    elif version == 2:
+                        array_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                        file.read(4) # 4 byte offset for GGUFv2
+                    array = [
+                        QuickGGUFReader.get_single(
+                            array_value_type,
+                            file=file
+                        ) for _ in range(array_length)
+                    ]
+                    metadata[key.decode()] = array
+                else:
+                    value = QuickGGUFReader.get_single(
+                        value_type,
+                        file=file
+                    )
+                    metadata[key.decode()] = value
+
+        return metadata
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 459b29f92..1cf3f0ddf 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -428,7 +428,15 @@ def __init__(
             print(f"Failed to load metadata: {e}", file=sys.stderr)
 
         if self.verbose:
-            print(f"Model metadata: {self.metadata}", file=sys.stderr)
+            print("Model metadata:", file=sys.stderr)
+            for k, v in self.metadata.items():
+                # only calculate repr() once as it may be slow for large arrays
+                repr_v = repr(v)
+                if len(repr_v) > 63:
+                    # truncate long values
+                    print(f" {k}: {repr_v[:60]}...", file=sys.stderr)
+                else:
+                    print(f" {k}: {repr_v}", file=sys.stderr)
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()

From a2b89cb22104441a929a661252df4094ded24412 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Fri, 12 Jul 2024 00:06:44 -0500
Subject: [PATCH 2/2] change repeat_penalty to 1.0 to match llama.cpp defaults

---
 llama_cpp/_internals.py | 218 +---------------------------------------
 llama_cpp/llama.py      |  24 ++---
 2 files changed, 10 insertions(+), 232 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 8c97f7dbc..dcd4e17ff 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -1,8 +1,6 @@
 from __future__ import annotations
 
 import os
-import sys
-import struct
 import ctypes
 
 from typing import (
@@ -13,8 +11,6 @@
 )
 from dataclasses import dataclass, field
 from contextlib import ExitStack
-from io import BufferedReader
-from enum import IntEnum
 
 import numpy as np
 import numpy.typing as npt
@@ -238,7 +234,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         )
 
     # Extra
-    def _metadata_no_arrays(self) -> Dict[str, str]:
+    def metadata(self) -> Dict[str, str]:
         assert self.model is not None
         metadata: Dict[str, str] = {}
         buffer_size = 1024
@@ -270,12 +266,6 @@ def _metadata_no_arrays(self) -> Dict[str, str]:
             metadata[key] = value
         return metadata
 
-    def metadata(self) -> Dict[str, Union[str, int, float, bool, list]]:
-        assert self.model is not None
-        # Uncomment the next line to use the old method
-        # return self._metadata_no_arrays()
-        return QuickGGUFReader.load_metadata(self.path_model)
-
     @staticmethod
     def default_params():
         """Get the default llama_model_params."""
@@ -731,7 +721,7 @@ class _LlamaSamplingParams:
     typical_p: float = 1.00
     temp: float = 0.80
     penalty_last_n: int = 64
-    penalty_repeat: float = 1.10
+    penalty_repeat: float = 1.0
     penalty_freq: float = 0.00
     penalty_present: float = 0.00
     mirostat: int = 0
@@ -876,207 +866,3 @@ def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
         self.prev.append(id)
-
-class QuickGGUFReader:
-    """
-    All logic in this class is based on the GGUF format specification, which
-    can be found here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
-    """
-    # NOTE: Officially, there is no way to determine whether a GGUF file is
-    #       little or big endian. The format specification directs us to
-    #       assume that a file is little endian unless additional information
-    #       is provided.
-    #
-    #       Additionally, GGUF files cannot be used on hosts with the
-    #       opposite endianness, and at this point in the code the model is
-    #       already loaded. Therefore, we can assume that the endianness of
-    #       the file matches the endianness of the host.
-
-    # the GGUF format versions that this class supports
-    SUPPORTED_GGUF_VERSIONS = [2, 3]
-
-    # GGUF only supports execution on little or big endian machines
-    if sys.byteorder not in ['little', 'big']:
-        raise ValueError(
-            "host is not little or big endian - GGUF is unsupported"
-        )
-
-    # Occasionally check to ensure these values are consistent with
-    # the latest values in llama.cpp/gguf-py/gguf/constants.py
-    class GGUFValueType(IntEnum):
-        UINT8 = 0
-        INT8 = 1
-        UINT16 = 2
-        INT16 = 3
-        UINT32 = 4
-        INT32 = 5
-        FLOAT32 = 6
-        BOOL = 7
-        STRING = 8
-        ARRAY = 9
-        UINT64 = 10
-        INT64 = 11
-        FLOAT64 = 12
-
-    # arguments for struct.unpack() based on gguf value type
-    value_packing: dict = {
-        GGUFValueType.UINT8: "=B",
-        GGUFValueType.INT8: "=b",
-        GGUFValueType.UINT16: "=H",
-        GGUFValueType.INT16: "=h",
-        GGUFValueType.UINT32: "=I",
-        GGUFValueType.INT32: "=i",
-        GGUFValueType.FLOAT32: "=f",
-        GGUFValueType.UINT64: "=Q",
-        GGUFValueType.INT64: "=q",
-        GGUFValueType.FLOAT64: "=d",
-        GGUFValueType.BOOL: "?"
-    }
-
-    # length in bytes for each gguf value type
-    value_lengths: dict = {
-        GGUFValueType.UINT8: 1,
-        GGUFValueType.INT8: 1,
-        GGUFValueType.UINT16: 2,
-        GGUFValueType.INT16: 2,
-        GGUFValueType.UINT32: 4,
-        GGUFValueType.INT32: 4,
-        GGUFValueType.FLOAT32: 4,
-        GGUFValueType.UINT64: 8,
-        GGUFValueType.INT64: 8,
-        GGUFValueType.FLOAT64: 8,
-        GGUFValueType.BOOL: 1
-    }
-
-    @staticmethod
-    def unpack(value_type: GGUFValueType, file: BufferedReader):
-        return struct.unpack(
-            QuickGGUFReader.value_packing.get(value_type),
-            file.read(QuickGGUFReader.value_lengths.get(value_type))
-        )[0]
-
-    @staticmethod
-    def get_single(
-        value_type: GGUFValueType,
-        file: BufferedReader
-    ) -> Union[str, int, float, bool]:
-        """Read a single value from an open file"""
-        if value_type == QuickGGUFReader.GGUFValueType.STRING:
-            string_length = QuickGGUFReader.unpack(
-                QuickGGUFReader.GGUFValueType.UINT64,
-                file=file
-            )
-            value = file.read(string_length)
-            # officially, strings that cannot be decoded into utf-8 are invalid
-            value = value.decode("utf-8")
-        else:
-            value = QuickGGUFReader.unpack(value_type, file=file)
-        return value
-
-    @staticmethod
-    def load_metadata(
-        fn: Union[os.PathLike[str], str]
-    ) -> dict[str, Union[str, int, float, bool, list]]:
-        """
-        Given a path to a GGUF file, peek at its header for metadata
-
-        Return a dictionary where all keys are strings, and values can be
-        strings, ints, floats, bools, or lists
-        """
-
-        metadata: dict[str, Union[str, int, float, bool, list]] = {}
-        with open(fn, "rb") as file:
-            magic = file.read(4)
-
-            if magic != b"GGUF":
-                raise ValueError(
-                    "your model file is not a valid GGUF file "
-                    f"(magic number mismatch, got {magic}, "
-                    "expected b'GGUF')"
-                )
-
-            version = QuickGGUFReader.unpack(
-                QuickGGUFReader.GGUFValueType.UINT32,
-                file=file
-            )
-
-            if version not in QuickGGUFReader.SUPPORTED_GGUF_VERSIONS:
-                raise ValueError(
-                    f"your model file reports GGUF version {version}, but "
-                    f"only versions {QuickGGUFReader.SUPPORTED_GGUF_VERSIONS} "
-                    "are supported. Re-convert your model or download a newer "
-                    "version"
-                )
-
-            tensor_count = QuickGGUFReader.unpack(
-                QuickGGUFReader.GGUFValueType.UINT64,
-                file=file
-            )
-
-            if version == 3:
-                metadata_kv_count = QuickGGUFReader.unpack(
-                    QuickGGUFReader.GGUFValueType.UINT64,
-                    file=file
-                )
-            elif version == 2:
-                metadata_kv_count = QuickGGUFReader.unpack(
-                    QuickGGUFReader.GGUFValueType.UINT32,
-                    file=file
-                )
-
-            for _ in range(metadata_kv_count):
-                if version == 3:
-                    key_length = QuickGGUFReader.unpack(
-                        QuickGGUFReader.GGUFValueType.UINT64,
-                        file=file
-                    )
-                elif version == 2:
-                    key_length = 0
-                    while key_length == 0:
-                        # read until next key is found
-                        key_length = QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT32,
-                            file=file
-                        )
-                    file.read(4) # 4 byte offset for GGUFv2
-                key = file.read(key_length)
-                value_type = QuickGGUFReader.GGUFValueType(
-                    QuickGGUFReader.unpack(
-                        QuickGGUFReader.GGUFValueType.UINT32,
-                        file=file
-                    )
-                )
-                if value_type == QuickGGUFReader.GGUFValueType.ARRAY:
-                    array_value_type = QuickGGUFReader.GGUFValueType(
-                        QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT32,
-                            file=file
-                        )
-                    )
-                    # array_length is the number of items in the array
-                    if version == 3:
-                        array_length = QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT64,
-                            file=file
-                        )
-                    elif version == 2:
-                        array_length = QuickGGUFReader.unpack(
-                            QuickGGUFReader.GGUFValueType.UINT32,
-                            file=file
-                        )
-                        file.read(4) # 4 byte offset for GGUFv2
-                    array = [
-                        QuickGGUFReader.get_single(
-                            array_value_type,
-                            file=file
-                        ) for _ in range(array_length)
-                    ]
-                    metadata[key.decode()] = array
-                else:
-                    value = QuickGGUFReader.get_single(
-                        value_type,
-                        file=file
-                    )
-                    metadata[key.decode()] = value
-
-        return metadata
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index aceec945f..ded3cf912 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -456,15 +456,7 @@ def __init__(
             print(f"Failed to load metadata: {e}", file=sys.stderr)
 
         if self.verbose:
-            print("Model metadata:", file=sys.stderr)
-            for k, v in self.metadata.items():
-                # only calculate repr() once as it may be slow for large arrays
-                repr_v = repr(v)
-                if len(repr_v) > 63:
-                    # truncate long values
-                    print(f" {k}: {repr_v[:60]}...", file=sys.stderr)
-                else:
-                    print(f" {k}: {repr_v}", file=sys.stderr)
+            print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
@@ -657,7 +649,7 @@ def sample(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         temp: float = 0.80,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         tfs_z: float = 1.0,
@@ -732,7 +724,7 @@ def generate(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         temp: float = 0.80,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         reset: bool = True,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
@@ -750,7 +742,7 @@ def generate(
         Examples:
             >>> llama = Llama("models/ggml-7b.bin")
             >>> tokens = llama.tokenize(b"Hello, world!")
-            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1):
+            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
             ...     print(llama.detokenize([token]))
 
         Args:
@@ -1019,7 +1011,7 @@ def _create_completion(
         stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         top_k: int = 40,
         stream: bool = False,
         seed: Optional[int] = None,
@@ -1638,7 +1630,7 @@ def create_completion(
         stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         top_k: int = 40,
         stream: bool = False,
         seed: Optional[int] = None,
@@ -1735,7 +1727,7 @@ def __call__(
         stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         top_k: int = 40,
         stream: bool = False,
         seed: Optional[int] = None,
@@ -1832,7 +1824,7 @@ def create_chat_completion(
         max_tokens: Optional[int] = None,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
-        repeat_penalty: float = 1.1,
+        repeat_penalty: float = 1.0,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
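
---

Note for reviewers: the header walk in QuickGGUFReader.load_metadata is easy to
sanity-check against a local model without loading llama.cpp at all. The sketch
below is illustrative only -- the file path is a placeholder, not part of the
patch -- and it mirrors this patch's assumptions: host and file endianness
agree (hence the "=" struct formats), and the GGUFv2 branch reads the metadata
KV count the same way this reader does (as a uint32).

    import struct

    # placeholder path; substitute any local GGUF file to try this
    GGUF_PATH = "models/example.gguf"

    with open(GGUF_PATH, "rb") as f:
        magic = f.read(4)
        if magic != b"GGUF":
            raise ValueError(f"not a GGUF file (got magic {magic!r})")
        # "=I"/"=Q" match the value_packing table above: standard-size
        # uint32/uint64 in the host's byte order
        version = struct.unpack("=I", f.read(4))[0]
        tensor_count = struct.unpack("=Q", f.read(8))[0]
        if version == 3:
            kv_count = struct.unpack("=Q", f.read(8))[0]  # uint64 in GGUFv3
        elif version == 2:
            kv_count = struct.unpack("=I", f.read(4))[0]  # uint32, as this reader treats GGUFv2
        else:
            raise ValueError(f"unsupported GGUF version {version}")
        print(f"GGUF v{version}: {tensor_count} tensors, "
              f"{kv_count} metadata key/value pairs")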