From 5903e2f15edb189af7bc83858ea05d8c9191683f Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Wed, 10 Jan 2024 09:16:59 +0000 Subject: [PATCH 01/27] convert functionary-v1 chat handler to use hf autotokenizer --- llama_cpp/llama_chat_format.py | 86 +++++++++++----------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 6c274aa82..ed1191c6d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -927,8 +927,8 @@ def format_saiga( return ChatFormatterResponse(prompt=_prompt.strip()) -@register_chat_completion_handler("functionary") -def functionary_chat_handler( +@register_chat_completion_handler("functionary-v1") +def functionary_v1_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, @@ -957,6 +957,12 @@ def functionary_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" + END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" + END_USER_TOKEN = "<|END_OF_USER|>" + END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" + END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" + START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" + END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" def generate_type_definition( param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs @@ -1028,22 +1034,23 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: parameters = function.get("parameters", {}) required_params = parameters.get("required", []) - schema += f" // {description}\n" - schema += f" type {function_name} = (_: {{\n" + schema += f"// {description}\n" + schema += f"type {function_name} = (_: {{\n" for param_name, param in parameters.get("properties", {}).items(): param_description = param.get("description", "") param_type = generate_type_definition(param, 2, shared_definitions) optional_indicator = "" if param_name in required_params else "?" 
- schema += f" // {param_description}\n" - schema += f" {param_name}{optional_indicator}: {param_type},\n" - schema += " }) => any;\n\n" + schema += f"// {param_description}\n" + schema += f"{param_name}{optional_indicator}: {param_type},\n" + schema += "}) => any;\n\n" - schema += "}} // namespace {}\n".format(namespace) + schema += "}} // namespace {}".format(namespace) return schema def prepare_messages_for_inference( messages: List[llama_types.ChatCompletionRequestMessage], + tokenizer: AutoTokenizer, functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, ): @@ -1054,8 +1061,7 @@ def prepare_messages_for_inference( role="system", content=generate_schema_from_functions(functions) ) ) - - if tools is not None: + elif tools is not None: all_messages.append( llama_types.ChatCompletionRequestSystemMessage( role="system", @@ -1085,49 +1091,8 @@ def prepare_messages_for_inference( "name" ] = f"functions.{message['function_call']['name']}" all_messages.append(message) - - all_messages.append( - llama_types.ChatCompletionRequestAssistantMessage( - role="assistant", content=None - ) - ) - - def message_to_str(msg: llama_types.ChatCompletionRequestMessage): - if msg["role"] == "system": - return f"system:\n{msg['content']}\n" - - elif msg["role"] == "function" and "name" in msg: - return f"function name={msg['name']}:\n{msg['content']}\n" - elif msg["role"] == "function" and "function_call" in msg: - return f"function name={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif msg["role"] == "tool": - if msg["content"] is not None: - return f"function name={msg['tool_call_id']}:\n{msg['content']}\n" - else: - return f"function name={msg['tool_call_id']}\n" - elif msg["role"] == "user": - if msg["content"] is None: - return "user:\n\n" - else: - return f"user:\n{msg['content']}\n" - elif msg["role"] == "assistant": - if msg["content"] is not None and "function_call" in msg: - return f"assistant:\n{msg['content']}\nassistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif "function_call" in msg: - return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif "tool_calls" in msg and len(msg["tool_calls"]) > 0: - for tool_call in msg[ - "tool_calls" - ]: # NOTE: probably doesn't work with the functionary model - return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}\n" - elif msg["content"] is None: - return "assistant" - else: - return f"assistant:\n{msg['content']}\n" - else: - raise ValueError(f"Unsupported role: {msg['role']}") - - return "".join([message_to_str(msg) for msg in all_messages]) + + return tokenizer.apply_chat_template(all_messages, tokenize=False) + "assistant:\n" if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1136,19 +1101,24 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + + from transformers import AutoTokenizer + + tokenizer_path = os.path.dirname(llama.model_path) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - prompt = prepare_messages_for_inference(messages, functions, tools) + prompt = prepare_messages_for_inference(messages, tokenizer, functions, tools) if function_call is None and (functions is None or len(functions) == 0): completion_or_completion_chunks = llama.create_completion( - 
prompt=prompt + ":\n", + prompt=prompt, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, typical_p=typical_p, stream=stream, - stop=["user:", ""], + stop=["user:", END_ASSISTANT_TOKEN], max_tokens=max_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, @@ -1166,9 +1136,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): if function_call is None or ( isinstance(function_call, str) and function_call == "auto" ): - stop = "\n" + stop = [END_ASSISTANT_TOKEN, END_FUNCTION_CALL_TOKEN] completion: llama_types.Completion = llama.create_completion( - prompt=prompt, stop=stop, stream=False + prompt=prompt + ":\n", stop=stop, stream=False, max_tokens=max_tokens ) # type: ignore completion_text = completion["choices"][0]["text"] # strip " to=functions." and ending ":" From 485f129ca5d414c8ad32f7ffd85f424976cd0dc8 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 08:46:02 +0000 Subject: [PATCH 02/27] add hf_tokenizer + inteegrate functionary-v1.4 prompt template --- llama_cpp/llama.py | 66 ++++++++++------ llama_cpp/llama_chat_format.py | 136 +++++++++++++++++++++------------ 2 files changed, 129 insertions(+), 73 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5c66bcf09..2fdd1b6e3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -14,6 +14,7 @@ Iterator, Deque, Callable, + Any, ) from collections import deque @@ -746,11 +747,24 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + hf_tokenizer: Optional[Any] = None, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str + + def tokenize(input, hf_tokenizer): + if hf_tokenizer is not None: + return hf_tokenizer.encode(input) + else: + return self.tokenize(input.encode("utf-8"), special=True) + + def detokenize(tokens, hf_tokenizer): + if hf_tokenizer is not None: + return hf_tokenizer.decode(tokens).encode("utf-8") + else: + return self.detokenize(tokens) completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) @@ -760,7 +774,7 @@ def _create_completion( # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[int] = ( ( - self.tokenize(prompt.encode("utf-8"), special=True) + tokenize(prompt, hf_tokenizer) if prompt != "" else [self.token_bos()] ) @@ -866,13 +880,13 @@ def logit_bias_processor( grammar=grammar, ): if token == self._token_eos: - text = self.detokenize(completion_tokens) + text = detokenize(completion_tokens, hf_tokenizer) finish_reason = "stop" break completion_tokens.append(token) - all_text = self.detokenize(completion_tokens) + all_text = detokenize(completion_tokens, hf_tokenizer) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -896,7 +910,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = self.detokenize(remaining_tokens) + remaining_text = detokenize(remaining_tokens, hf_tokenizer) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -918,17 +932,17 @@ def logit_bias_processor( for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(self.detokenize([token])) + token_end_position += len(detokenize([token], hf_tokenizer)) # Check if stop sequence is in the token if 
token_end_position > ( remaining_length - first_stop_position ): break - token_str = self.detokenize([token]).decode( + token_str = detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - self.detokenize(completion_tokens[:returned_tokens]).decode( + detokenize(completion_tokens[:returned_tokens], hf_tokenizer).decode( "utf-8", errors="ignore" ) ) @@ -942,7 +956,7 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode( + detokenize([i], hf_tokenizer).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -950,7 +964,7 @@ def logit_bias_processor( top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode( + detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ) ], @@ -966,7 +980,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( + "text": detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ), "index": 0, @@ -980,7 +994,7 @@ def logit_bias_processor( decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = self.detokenize(remaining_tokens[:i]) + bs = detokenize(remaining_tokens[:i], hf_tokenizer) ts = bs.decode("utf-8") decode_success = True break @@ -1015,14 +1029,14 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens) + text = detokenize(completion_tokens, hf_tokenizer) finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = self.detokenize(completion_tokens) + text = detokenize(completion_tokens, hf_tokenizer) finish_reason = "stop" if self.verbose: @@ -1030,7 +1044,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - all_text = self.detokenize(remaining_tokens) + all_text = detokenize(remaining_tokens, hf_tokenizer) any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: end = min(all_text.index(stop) for stop in any_stop) @@ -1039,17 +1053,17 @@ def logit_bias_processor( token_end_position = 0 for token in remaining_tokens: - token_end_position += len(self.detokenize([token])) + token_end_position += len(detokenize([token], hf_tokenizer)) logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: if token == self.token_bos(): continue - token_str = self.detokenize([token]).decode( + token_str = detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - self.detokenize(completion_tokens[:returned_tokens]) + detokenize(completion_tokens[:returned_tokens], hf_tokenizer) ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] @@ -1061,13 +1075,13 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1075,7 +1089,7 @@ def logit_bias_processor( } if token_end_position >= end: - last_text = 
self.detokenize([token]) + last_text = detokenize([token], hf_tokenizer) if token_end_position == end - 1: break returned_tokens += 1 @@ -1104,7 +1118,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( + "text": detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ), "index": 0, @@ -1163,7 +1177,7 @@ def logit_bias_processor( all_tokens = completion_tokens all_token_strs = [ - self.detokenize([token]).decode("utf-8", errors="ignore") + detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") for token in all_tokens ] all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] @@ -1176,7 +1190,7 @@ def logit_bias_processor( text_offsets.append( text_offset + len( - self.detokenize(all_tokens[:idx]).decode( + detokenize(all_tokens[:idx], hf_tokenizer).decode( "utf-8", errors="ignore" ) ) @@ -1189,7 +1203,7 @@ def logit_bias_processor( ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ -1254,6 +1268,7 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + hf_tokenizer: Optional[Any] = None, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1317,6 +1332,7 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + hf_tokenizer=hf_tokenizer, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1444,6 +1460,7 @@ def create_chat_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + hf_tokenizer_path: Optional[str] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1511,6 +1528,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + hf_tokenizer_path=hf_tokenizer_path, ) def __getstate__(self): diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ed1191c6d..7916d7d6e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -927,8 +927,8 @@ def format_saiga( return ChatFormatterResponse(prompt=_prompt.strip()) -@register_chat_completion_handler("functionary-v1") -def functionary_v1_chat_handler( +@register_chat_completion_handler("functionary") +def functionary_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, @@ -954,6 +954,7 @@ def functionary_v1_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + hf_tokenizer_path: Optional[str] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -1102,13 +1103,14 @@ def prepare_messages_for_inference( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" from transformers import AutoTokenizer - tokenizer_path = os.path.dirname(llama.model_path) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) prompt = prepare_messages_for_inference(messages, tokenizer, functions, tools) - + + # If no tools/functions are provided if function_call is None and (functions is None or len(functions) == 0): completion_or_completion_chunks = llama.create_completion( prompt=prompt, @@ -1130,39 +1132,23 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, + hf_tokenizer=tokenizer ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore - - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): - stop = [END_ASSISTANT_TOKEN, END_FUNCTION_CALL_TOKEN] - completion: llama_types.Completion = llama.create_completion( - prompt=prompt + ":\n", stop=stop, stream=False, max_tokens=max_tokens - ) # type: ignore - completion_text = completion["choices"][0]["text"] - # strip " to=functions." and ending ":" - function_call = completion_text.split(".")[-1][:-1] - new_prompt = prompt + completion_text + stop - elif isinstance(function_call, str) and function_call != "none": - new_prompt = prompt + f":\n" - elif isinstance(function_call, dict): - new_prompt = prompt + f" to=functions.{function_call['name']}:\n" - function_call = function_call["name"] - else: - new_prompt = prompt + f":\n" - - function_body = None - for function in functions or []: - if function["name"] == function_call: - function_body = function["parameters"] - break - for tool in tools or []: - if tool["type"] == "function" and tool["function"]["name"] == function_call: - function_body = tool["function"]["parameters"] - break - - if function_body is not None: + + assert stream is False # TODO: support stream mode + + def get_grammar(function_call): + function_body = None + for function in functions or []: + if function["name"] == function_call: + function_body = function["parameters"] + break + for tool in tools or []: + if tool["type"] == "function" and tool["function"]["name"] == function_call: + function_body = tool["function"]["parameters"] + break + try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( @@ -1182,21 +1168,38 @@ def prepare_messages_for_inference( grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.JSON_GBNF ) + + return grammar + + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, [], []) + stops = END_ASSISTANT_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + function_call = function_call["name"] + stops = END_FUNCTION_CALL_TOKEN + grammar = get_grammar(function_call) else: - with 
suppress_stdout_stderr(disable=llama.verbose): - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) - + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=["user:", ""], - stream=False, - grammar=grammar, - max_tokens=max_tokens, + prompt=prompt, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, typical_p=typical_p, + stream=stream, + stop=stops, + max_tokens=max_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1206,11 +1209,46 @@ def prepare_messages_for_inference( mirostat_eta=mirostat_eta, model=model, logits_processor=logits_processor, - ) # type: ignore - + grammar=grammar, + hf_tokenizer=tokenizer, + ) + completion_text = completion["choices"][0]["text"] + + # If the generation does not involve a function call + if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: + new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + grammar = get_grammar(function_call) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=END_FUNCTION_CALL_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + else: + new_prompt = prompt + assert "usage" in completion assert isinstance(function_call, str) - assert stream is False # TODO: support stream mode if llama.verbose: print(new_prompt) @@ -1230,7 +1268,7 @@ def prepare_messages_for_inference( "content": None, "function_call": { "name": function_call, - "arguments": completion["choices"][0]["text"], + "arguments": completion["choices"][0]["text"].strip(), }, "tool_calls": [ { @@ -1238,7 +1276,7 @@ def prepare_messages_for_inference( "type": "function", "function": { "name": function_call, - "arguments": completion["choices"][0]["text"], + "arguments": completion["choices"][0]["text"].strip(), }, } ], From 9580beeaa4d144fcfe22e83cd17924f544f7baa4 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 10:56:54 +0000 Subject: [PATCH 03/27] integrate functionary v2 prompt template --- llama_cpp/llama_chat_format.py | 191 ++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 52 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 7916d7d6e..f1652df2b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,7 +4,7 @@ import json import ctypes import dataclasses -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol import jinja2 @@ -958,12 +958,26 @@ def functionary_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, 
Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" - END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" - END_USER_TOKEN = "<|END_OF_USER|>" - END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" - END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" - START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" - END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" + + assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) + + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.additional_special_tokens: + version = "v1" + END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" + END_USER_TOKEN = "<|END_OF_USER|>" + END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" + END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" + START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" + END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" + else: + version = "v2" + RECIPIENT_TOKEN = "<|recipient|>" + FROM_TOKEN = "<|from|>" + STOP_TOKEN = "<|stop|>" + CONTENT_TOKEN = "<|content|>" def generate_type_definition( param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs @@ -1052,6 +1066,7 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: def prepare_messages_for_inference( messages: List[llama_types.ChatCompletionRequestMessage], tokenizer: AutoTokenizer, + version: Literal["v1", "v2"], functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, ): @@ -1092,8 +1107,13 @@ def prepare_messages_for_inference( "name" ] = f"functions.{message['function_call']['name']}" all_messages.append(message) + + if version == "v1": + suffix = "assistant:\n" + else: + suffix = "<|from|>assistant\n<|recipient|>" - return tokenizer.apply_chat_template(all_messages, tokenize=False) + "assistant:\n" + return tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1102,16 +1122,17 @@ def prepare_messages_for_inference( function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) - - assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) - prompt = prepare_messages_for_inference(messages, tokenizer, functions, tools) + prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools) # If no tools/functions are provided if function_call is None and (functions is None or len(functions) == 0): + if version == "v1": + stop = END_ASSISTANT_TOKEN + else: + stop = STOP_TOKEN + prompt += "all\n<|content|>" + completion_or_completion_chunks = llama.create_completion( prompt=prompt, temperature=temperature, @@ -1120,7 +1141,7 @@ def prepare_messages_for_inference( min_p=min_p, typical_p=typical_p, stream=stream, - stop=["user:", END_ASSISTANT_TOKEN], + stop=stop, max_tokens=max_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, @@ -1175,20 +1196,24 @@ def get_grammar(function_call): if function_call 
is None or ( isinstance(function_call, str) and function_call == "auto" ): - stops = ["\n", END_ASSISTANT_TOKEN] + stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else CONTENT_TOKEN # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, [], []) - stops = END_ASSISTANT_TOKEN + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + stops = END_ASSISTANT_TOKEN if version == "v1" else STOP_TOKEN # If tool_choice/function_call is provided elif isinstance(function_call, dict): - prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + if version == "v1": + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + stops = END_FUNCTION_CALL_TOKEN + else: + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + stops = STOP_TOKEN function_call = function_call["name"] - stops = END_FUNCTION_CALL_TOKEN grammar = get_grammar(function_call) else: prompt = prompt - stops = ["\n", END_ASSISTANT_TOKEN] + stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN completion: llama_types.Completion = llama.create_completion( prompt=prompt, @@ -1214,38 +1239,100 @@ def get_grammar(function_call): ) completion_text = completion["choices"][0]["text"] - # If the generation does not involve a function call - if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: - new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" - function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() - grammar = get_grammar(function_call) - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=END_FUNCTION_CALL_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + if version == "v1": + # If the generation does not involve a function call + if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: + new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + grammar = get_grammar(function_call) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=END_FUNCTION_CALL_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + 
mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + else: + new_prompt = prompt else: - new_prompt = prompt + # If the generation does not involve a function call + if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + new_prompt = prompt + completion_text + CONTENT_TOKEN + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=STOP_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif not prompt.endswith(CONTENT_TOKEN): + new_prompt = prompt + completion_text + CONTENT_TOKEN + function_call = completion_text[:-1].strip() + grammar = get_grammar(function_call) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=STOP_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + else: + new_prompt = prompt + assert "usage" in completion assert isinstance(function_call, str) From bb48a838feafe10830745d39353f03fcc6d6c079 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 11:16:19 +0000 Subject: [PATCH 04/27] update readme --- README.md | 8 +++++--- llama_cpp/llama.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f97ea0f77..a25ad580e 100644 --- a/README.md +++ b/README.md @@ -209,8 +209,9 @@ Chat completion is available through the [`create_chat_completion`](https://llam The high-level API also provides a simple interface for function calling. -Note that the only model that supports full function calling at this time is "functionary". -The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) +Note that the only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). +The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). +Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. They are already included in the respective HF repositories hosting the gguf files. 
```python @@ -254,7 +255,8 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h "function": { "name": "UserDetail" } - }] + }], + hf_tokenizer_path="path/to/functionary-gguf/" ) ``` diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2fdd1b6e3..2ee738446 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1298,6 +1298,7 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. + hf_tokenizer: A HuggingFace AutoTokenizer to use optionally. Raises: ValueError: If the requested tokens exceed the context window. @@ -1493,6 +1494,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + hf_tokenizer_path: A HuggingFace AutoTokenizer file path to use. Returns: Generated chat completion or a stream of chat completion chunks. From 036993119c81f1932b707060aade2edc55f9eebb Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 13:23:36 +0000 Subject: [PATCH 05/27] set up parallel function calling wip --- llama_cpp/llama_chat_format.py | 51 +++++++++++++++------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f1652df2b..985eb909c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1196,7 +1196,7 @@ def get_grammar(function_call): if function_call is None or ( isinstance(function_call, str) and function_call == "auto" ): - stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else CONTENT_TOKEN + stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" @@ -1303,39 +1303,32 @@ def get_grammar(function_call): ) # type: ignore return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif not prompt.endswith(CONTENT_TOKEN): - new_prompt = prompt + completion_text + CONTENT_TOKEN - function_call = completion_text[:-1].strip() - grammar = get_grammar(function_call) - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=STOP_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + elif prompt.endswith(RECIPIENT_TOKEN): + all_calls = completion_text.split(f"\n{FROM_TOKEN} assistant\n{RECIPIENT_TOKEN}") + function_calls = [curr_call.split(f"\n{CONTENT_TOKEN}")[0].strip() for curr_call in all_calls] + function_bodies = [curr_call.split(f"\n{CONTENT_TOKEN}")[1].strip() for curr_call in all_calls] + breakpoint() else: new_prompt = prompt assert "usage" in completion - assert isinstance(function_call, str) + if function_call is not None: + assert isinstance(function_call, str) + tool_calls = + else: + tool_calls = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { + "id": function_call, + "type": 
"function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) if llama.verbose: print(new_prompt) From 9540df9b1bc5239b007de19965fcef72bc4b6083 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 16:37:16 +0000 Subject: [PATCH 06/27] set up parallel function calling --- README.md | 11 +- llama_cpp/llama_chat_format.py | 273 ++++++++++++++++----------------- 2 files changed, 136 insertions(+), 148 deletions(-) diff --git a/README.md b/README.md index a25ad580e..f63645c77 100644 --- a/README.md +++ b/README.md @@ -209,8 +209,8 @@ Chat completion is available through the [`create_chat_completion`](https://llam The high-level API also provides a simple interface for function calling. -Note that the only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). -The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). +The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. + Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. They are already included in the respective HF repositories hosting the gguf files. @@ -218,12 +218,8 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i >>> from llama_cpp import Llama >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") >>> llm.create_chat_completion( + hf_tokenizer_path="path/to/functionary-gguf/" messages = [ - { - "role": "system", - "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary" - - }, { "role": "user", "content": "Extract Jason is 25 years old" @@ -256,7 +252,6 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i "name": "UserDetail" } }], - hf_tokenizer_path="path/to/functionary-gguf/" ) ``` diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 985eb909c..0311163e5 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,6 +4,8 @@ import json import ctypes import dataclasses +import random +import string from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol import jinja2 @@ -1192,147 +1194,147 @@ def get_grammar(function_call): return grammar - # If no or "auto" tool_choice/function_call - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): - stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN - # If tool_choice/function_call is "none" - elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" - stops = END_ASSISTANT_TOKEN if version == "v1" else STOP_TOKEN - # If tool_choice/function_call is provided - elif isinstance(function_call, dict): - if version == "v1": + def create_completion(stop): + completion: llama_types.Completion = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + hf_tokenizer=tokenizer, + ) + + return completion + + function_calls, function_bodies = [], [] + + if version == "v1": + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + stops = END_ASSISTANT_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" stops = END_FUNCTION_CALL_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) else: - prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" - stops = STOP_TOKEN - function_call = function_call["name"] - grammar = get_grammar(function_call) - else: - prompt = prompt - stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] - completion: llama_types.Completion = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stops, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - 
mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, - hf_tokenizer=tokenizer, - ) - completion_text = completion["choices"][0]["text"] - - if version == "v1": # If the generation does not involve a function call if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If the generation involves a function call in completion, generate the parameters elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: - new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" - function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() - grammar = get_grammar(function_call) - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=END_FUNCTION_CALL_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + function_bodies.append(completion["choices"][0]["text"].strip()) + # If the prompt involves a function call, just append generated parameters to function_bodies else: - new_prompt = prompt + function_bodies.append(completion_text.strip()) else: - # If the generation does not involve a function call - if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): - new_prompt = prompt + completion_text + CONTENT_TOKEN - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=STOP_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + # Loop until all parallel function calls are generated + while True: + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + grammar = None + stops = CONTENT_TOKEN + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + stops = STOP_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += 
f"{function_call['name']}\n{CONTENT_TOKEN}" + stops = STOP_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt + stops = STOP_TOKEN + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif prompt.endswith(RECIPIENT_TOKEN): - all_calls = completion_text.split(f"\n{FROM_TOKEN} assistant\n{RECIPIENT_TOKEN}") - function_calls = [curr_call.split(f"\n{CONTENT_TOKEN}")[0].strip() for curr_call in all_calls] - function_bodies = [curr_call.split(f"\n{CONTENT_TOKEN}")[1].strip() for curr_call in all_calls] - breakpoint() - else: - new_prompt = prompt + # If the generation does not involve a function call + if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate model response if the model decides not to call any function + elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + prompt += completion_text + CONTENT_TOKEN + completion = create_completion(stop=STOP_TOKEN) + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate parameters if model decides to call a function + elif prompt.endswith(RECIPIENT_TOKEN): + function_calls.append(completion_text[:-1]) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=[STOP_TOKEN, "\n"]) + function_bodies.append(completion["choices"][0]["text"].strip()) + prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" + grammar = None + + # Try to generate the beginning of next turn + # If empty completion, break from loop + next_turn_completion_text = create_completion( + stop=[STOP_TOKEN, RECIPIENT_TOKEN] + )["choices"][0]["text"] + if len(next_turn_completion_text) > 0: + prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" + else: + break + # Break from loop if tool_choice/function_call is provided as a dict + else: + function_bodies.append(completion_text.strip()) + break - assert "usage" in completion - if function_call is not None: - assert isinstance(function_call, str) - tool_calls = - else: - tool_calls = [] - for function_call, function_body in zip(function_calls, function_bodies): - tool_calls.append( - { - "id": function_call, - "type": "function", - "function": { - "name": function_call, - "arguments": function_body, - }, - } - ) - - if llama.verbose: - print(new_prompt) - print(completion["choices"][0]["text"]) + assert len(function_calls) > 0 + assert len(function_calls) == len(function_bodies) + + tool_calls = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { + "id": "call_" + "".join( + [random.choice(string.ascii_letters + string.digits) for _ in range(24)] + ), + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) # TODO: support stream mode return llama_types.CreateChatCompletionResponse( @@ -1347,19 +1349,10 @@ def get_grammar(function_call): "role": "assistant", "content": None, "function_call": { - "name": function_call, - "arguments": completion["choices"][0]["text"].strip(), + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], }, - "tool_calls": [ - { - "id": function_call, - "type": "function", - "function": { - "name": function_call, - 
"arguments": completion["choices"][0]["text"].strip(), - }, - } - ], + "tool_calls": tool_calls, }, "finish_reason": "tool_calls", } From c71863cd2a4cf1afe19d1c717178975c59fd6464 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Fri, 12 Jan 2024 12:01:15 +0800 Subject: [PATCH 07/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f63645c77..3a9819363 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i >>> from llama_cpp import Llama >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") >>> llm.create_chat_completion( - hf_tokenizer_path="path/to/functionary-gguf/" + hf_tokenizer_path="path/to/functionary-gguf/", messages = [ { "role": "user", From 4cf87363fc877b2c110a113cc59689751940b744 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Fri, 12 Jan 2024 12:03:41 +0800 Subject: [PATCH 08/27] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3a9819363..611f11753 100644 --- a/README.md +++ b/README.md @@ -246,12 +246,12 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i } } }], - tool_choice=[{ + tool_choice={ "type": "function", "function": { "name": "UserDetail" } - }], + }, ) ``` From ae7009b9eeb30ebfd12b81d2b4cefa7dda3cc4ca Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Tue, 23 Jan 2024 09:53:20 +0000 Subject: [PATCH 09/27] refactor tokenizers --- llama_cpp/llama.py | 120 ++++++++++++++++++--------------- llama_cpp/llama_chat_format.py | 14 ++-- 2 files changed, 71 insertions(+), 63 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2ee738446..5ace18d72 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -15,6 +15,7 @@ Deque, Callable, Any, + Protocol, ) from collections import deque @@ -61,6 +62,8 @@ def __init__( use_mmap: bool = True, use_mlock: bool = False, kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, + # Tokenizer Params (Optionally for HF AutoTokenizers) + hf_tokenizer_path: Optional[str] = None, # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, @@ -131,6 +134,7 @@ def __init__( use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. kv_overrides: Key-value overrides for the model. + hf_tokenizer_path: Override llama.cpp tokenizer with HF AutoTokenizer from this path if provided. seed: RNG seed, -1 for random n_ctx: Text context, 0 = from model n_batch: Prompt processing maximum batch size @@ -228,6 +232,13 @@ def __init__( self.n_threads_batch = n_threads_batch or max( multiprocessing.cpu_count() // 2, 1 ) + + # Tokenizer Params + if hf_tokenizer_path is not None: + self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) + else: + self._tokenizer_to_use = LlamaCppTokenizer(self._model) + # Context Params self.context_params = llama_cpp.llama_context_default_params() self.context_params.seed = seed @@ -385,7 +396,7 @@ def tokenize( Returns: A list of tokens. """ - return self._model.tokenize(text, add_bos, special) + return self._tokenizer_to_use.encode(text, add_bos, special) def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. @@ -396,7 +407,7 @@ def detokenize(self, tokens: List[int]) -> bytes: Returns: The detokenized string. 
""" - return self._model.detokenize(tokens) + return self._tokenizer_to_use.decode(tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. @@ -747,24 +758,11 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, - hf_tokenizer: Optional[Any] = None, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str - - def tokenize(input, hf_tokenizer): - if hf_tokenizer is not None: - return hf_tokenizer.encode(input) - else: - return self.tokenize(input.encode("utf-8"), special=True) - - def detokenize(tokens, hf_tokenizer): - if hf_tokenizer is not None: - return hf_tokenizer.decode(tokens).encode("utf-8") - else: - return self.detokenize(tokens) completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) @@ -774,7 +772,7 @@ def detokenize(tokens, hf_tokenizer): # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[int] = ( ( - tokenize(prompt, hf_tokenizer) + self.tokenize(prompt.encode("utf-8"), special=True) if prompt != "" else [self.token_bos()] ) @@ -880,13 +878,13 @@ def logit_bias_processor( grammar=grammar, ): if token == self._token_eos: - text = detokenize(completion_tokens, hf_tokenizer) + text = self.detokenize(completion_tokens) finish_reason = "stop" break completion_tokens.append(token) - all_text = detokenize(completion_tokens, hf_tokenizer) + all_text = self.detokenize(completion_tokens) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -910,7 +908,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = detokenize(remaining_tokens, hf_tokenizer) + remaining_text = self.detokenize(remaining_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -932,17 +930,17 @@ def logit_bias_processor( for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(detokenize([token], hf_tokenizer)) + token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = detokenize([token], hf_tokenizer).decode( + token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - detokenize(completion_tokens[:returned_tokens], hf_tokenizer).decode( + self.detokenize(completion_tokens[:returned_tokens]).decode( "utf-8", errors="ignore" ) ) @@ -956,7 +954,7 @@ def logit_bias_processor( ) ) top_logprob = { - detokenize([i], hf_tokenizer).decode( + self.detokenize([i]).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -964,7 +962,7 @@ def logit_bias_processor( top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - detokenize([token], hf_tokenizer).decode( + self.detokenize([token]).decode( "utf-8", errors="ignore" ) ], @@ -980,7 +978,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": detokenize([token], hf_tokenizer).decode( + "text": self.detokenize([token]).decode( "utf-8", errors="ignore" ), "index": 0, @@ -994,7 +992,7 @@ def logit_bias_processor( decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = detokenize(remaining_tokens[:i], hf_tokenizer) + bs = 
self.detokenize(remaining_tokens[:i]) ts = bs.decode("utf-8") decode_success = True break @@ -1029,14 +1027,15 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = detokenize(completion_tokens, hf_tokenizer) + text = self.detokenize(completion_tokens) + finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = detokenize(completion_tokens, hf_tokenizer) + text = self.detokenize(completion_tokens) finish_reason = "stop" if self.verbose: @@ -1044,7 +1043,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - all_text = detokenize(remaining_tokens, hf_tokenizer) + all_text = self.detokenize(remaining_tokens) any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: end = min(all_text.index(stop) for stop in any_stop) @@ -1053,17 +1052,17 @@ def logit_bias_processor( token_end_position = 0 for token in remaining_tokens: - token_end_position += len(detokenize([token], hf_tokenizer)) + token_end_position += len(self.detokenize([token])) logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: if token == self.token_bos(): continue - token_str = detokenize([token], hf_tokenizer).decode( + token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - detokenize(completion_tokens[:returned_tokens], hf_tokenizer) + self.detokenize(completion_tokens[:returned_tokens]) ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] @@ -1075,13 +1074,13 @@ def logit_bias_processor( ) ) top_logprob = { - detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") + self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1089,7 +1088,7 @@ def logit_bias_processor( } if token_end_position >= end: - last_text = detokenize([token], hf_tokenizer) + last_text = self.detokenize([token]) if token_end_position == end - 1: break returned_tokens += 1 @@ -1118,7 +1117,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": detokenize([token], hf_tokenizer).decode( + "text": self.detokenize([token]).decode( "utf-8", errors="ignore" ), "index": 0, @@ -1177,7 +1176,7 @@ def logit_bias_processor( all_tokens = completion_tokens all_token_strs = [ - detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") + self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] @@ -1190,7 +1189,7 @@ def logit_bias_processor( text_offsets.append( text_offset + len( - detokenize(all_tokens[:idx], hf_tokenizer).decode( + self.detokenize(all_tokens[:idx]).decode( "utf-8", errors="ignore" ) ) @@ -1203,7 +1202,7 @@ def logit_bias_processor( ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { - detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ 
-1268,7 +1267,6 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, - hf_tokenizer: Optional[Any] = None, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1298,7 +1296,6 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. - hf_tokenizer: A HuggingFace AutoTokenizer to use optionally. Raises: ValueError: If the requested tokens exceed the context window. @@ -1333,7 +1330,6 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, - hf_tokenizer=hf_tokenizer, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1461,7 +1457,6 @@ def create_chat_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, - hf_tokenizer_path: Optional[str] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1494,7 +1489,6 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. - hf_tokenizer_path: A HuggingFace AutoTokenizer file path to use. Returns: Generated chat completion or a stream of chat completion chunks. @@ -1530,7 +1524,6 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, - hf_tokenizer_path=hf_tokenizer_path, ) def __getstate__(self): @@ -1674,9 +1667,9 @@ def n_vocab(self) -> int: """Return the vocabulary size.""" return self._model.n_vocab() - def tokenizer(self) -> "LlamaTokenizer": + def tokenizer(self) -> Union["LlamaCppTokenizer", "HFTokenizer"]: """Return the tokenizer for this model.""" - return LlamaTokenizer(self) + return self._tokenizer_to_use def token_eos(self) -> int: """Return the end-of-sequence token.""" @@ -1719,21 +1712,40 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): return longest_prefix -class LlamaTokenizer: +class LlamaTokenizer(Protocol): + def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + ... + + def decode(self, tokens: List[int]) -> bytes: + ... 
+ +class LlamaCppTokenizer: def __init__(self, llama: Llama): self.llama = llama - def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True - ) + def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + return self.llama._model.tokenize(text, add_bos, special) - def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + def decode(self, tokens: List[int]) -> bytes: + return self.llama._model.detokenize(tokens) @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": return cls(Llama(model_path=path, vocab_only=True)) + + +class HFTokenizer: + def __init__(self, hf_tokenizer_path): + from transformers import AutoTokenizer + self.hf_tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) + + def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + return self.hf_tokenizer.encode( + text.decode("utf-8", errors="ignore"), add_special_tokens=special + ) + + def decode(self, tokens: List[int]) -> bytes: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") class LlamaState: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0311163e5..42acad615 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -956,17 +956,15 @@ def functionary_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, - hf_tokenizer_path: Optional[str] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" - assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" + tokenizer = llama.tokenizer() + assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) - - if "<|START_OF_FUNCTION_CALL|>" in tokenizer.additional_special_tokens: + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: version = "v1" END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" END_USER_TOKEN = "<|END_OF_USER|>" @@ -1115,7 +1113,7 @@ def prepare_messages_for_inference( else: suffix = "<|from|>assistant\n<|recipient|>" - return tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix + return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1155,7 +1153,6 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, - hf_tokenizer=tokenizer ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore @@ -1215,7 +1212,6 @@ def create_completion(stop): model=model, logits_processor=logits_processor, grammar=grammar, - hf_tokenizer=tokenizer, ) return completion @@ -1242,7 +1238,7 @@ def create_completion(stop): else: prompt = prompt stops = ["\n", END_ASSISTANT_TOKEN] - + completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] From ebb4ec037594e058c9b1d6006bb11c552f2f1c19 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Tue, 23 Jan 2024 14:09:00 +0000 Subject: [PATCH 10/27] include old functionary handler for backward compatibility --- README.md | 5 +- llama_cpp/llama.py | 17 +- llama_cpp/llama_chat_format.py | 354 +++++++++++++++++++++++++++++++++ 3 files changed, 365 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 611f11753..c81106385 100644 --- a/README.md +++ b/README.md @@ -209,16 +209,15 @@ Chat completion is available through the [`create_chat_completion`](https://llam The high-level API also provides a simple interface for function calling. -The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. +The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class. Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. 
They are already included in the respective HF repositories hosting the gguf files. ```python >>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") +>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", hf_tokenizer_path="path/to/functionary-gguf/", chat_format="functionary-v2") >>> llm.create_chat_completion( - hf_tokenizer_path="path/to/functionary-gguf/", messages = [ { "role": "user", diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5ace18d72..94a5ae7e8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -233,12 +233,6 @@ def __init__( multiprocessing.cpu_count() // 2, 1 ) - # Tokenizer Params - if hf_tokenizer_path is not None: - self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) - else: - self._tokenizer_to_use = LlamaCppTokenizer(self._model) - # Context Params self.context_params = llama_cpp.llama_context_default_params() self.context_params.seed = seed @@ -290,6 +284,13 @@ def __init__( self._model = _LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose ) + + # Tokenizer Params + if hf_tokenizer_path is not None: + self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) + else: + self._tokenizer_to_use = LlamaCppTokenizer(self._model) + # Set the default value for the context and correct the batch if n_ctx == 0: n_ctx = self._model.n_ctx_train() @@ -1724,10 +1725,10 @@ def __init__(self, llama: Llama): self.llama = llama def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: - return self.llama._model.tokenize(text, add_bos, special) + return self.llama.tokenize(text, add_bos, special) def decode(self, tokens: List[int]) -> bytes: - return self.llama._model.detokenize(tokens) + return self.llama.detokenize(tokens) @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 42acad615..d48f48d65 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -959,6 +959,360 @@ def functionary_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" + + def generate_type_definition( + param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs + ) -> str: + indent = " " * indent_level + if "$ref" in param: + # Reference to a shared definition + ref_name = param["$ref"].split("/")[ + -1 + ] # Extract the type name from the reference + return ref_name + elif param.get("type") == "array": + items = param.get("items", {}) + item_type = generate_type_definition(items, indent_level + 1, shared_defs) + return f"Array<{item_type}>" + elif param.get("type") == "object": + properties = param.get("properties", {}) + nested_schema = "{\n" + for nested_param_name, nested_param in properties.items(): + nested_param_type = generate_type_definition( + nested_param, indent_level + 1, shared_defs + ) + nested_schema += ( + f"{indent} {nested_param_name}: {nested_param_type},\n" + ) + nested_schema += indent + "}" + return nested_schema + elif "enum" in param: + # Enum type + return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]]) + else: + # Simple type + return param.get("type", "any") + + def generate_shared_definitions(shared_defs, indent_level: int) -> str: + indent = " " * indent_level + shared_definitions = "" + for def_name, def_properties in shared_defs.items(): + shared_definitions += f"{indent}type {def_name} = " + if def_properties.get("type") == "object": + shared_definitions += generate_type_definition( + def_properties, indent_level, shared_defs + ) + elif "enum" in def_properties: + # Enum type + shared_definitions += " | ".join( + [f'"{enum_value}"' for enum_value in def_properties["enum"]] + ) + shared_definitions += ";\n" + return shared_definitions + + def generate_schema_from_functions(functions, namespace="functions") -> str: + schema = ( + "// Supported function definitions that should be called when necessary.\n" + ) + schema += f"namespace {namespace} {{\n\n" + + # Generate shared definitions + shared_definitions = {} + for function in functions: + parameters = function.get("parameters", {}) + shared_definitions.update(parameters.get("$defs", {})) + + schema += generate_shared_definitions(shared_definitions, 1) + + for function in functions: + function_name = function["name"] + description = function.get("description", "") + parameters = function.get("parameters", {}) + required_params = parameters.get("required", []) + + schema += f" // {description}\n" + schema += f" type {function_name} = (_: {{\n" + + for param_name, param in parameters.get("properties", {}).items(): + param_description = param.get("description", "") + param_type = generate_type_definition(param, 2, shared_definitions) + optional_indicator = "" if param_name in required_params else "?" 
+ schema += f" // {param_description}\n" + schema += f" {param_name}{optional_indicator}: {param_type},\n" + schema += " }) => any;\n\n" + + schema += "}} // namespace {}\n".format(namespace) + return schema + + def prepare_messages_for_inference( + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + ): + all_messages: List[llama_types.ChatCompletionRequestMessage] = [] + if functions is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=generate_schema_from_functions(functions) + ) + ) + + if tools is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", + content=generate_schema_from_functions( + [ + tool["function"] + for tool in tools + if tool["type"] == "function" + ] + ), + ) + ) + + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=SYSTEM_MESSAGE + ) + ) + + for message in messages: + # Function call responses + if message["role"] == "function" and "name" in message: + message["name"] = f"functions.{message['name']}" + # Function call requests by assistant + if "function_call" in message: + message["function_call"][ + "name" + ] = f"functions.{message['function_call']['name']}" + all_messages.append(message) + + all_messages.append( + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", content=None + ) + ) + + def message_to_str(msg: llama_types.ChatCompletionRequestMessage): + if msg["role"] == "system": + return f"system:\n{msg['content']}\n" + + elif msg["role"] == "function" and "name" in msg: + return f"function name={msg['name']}:\n{msg['content']}\n" + elif msg["role"] == "function" and "function_call" in msg: + return f"function name={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" + elif msg["role"] == "tool": + if msg["content"] is not None: + return f"function name={msg['tool_call_id']}:\n{msg['content']}\n" + else: + return f"function name={msg['tool_call_id']}\n" + elif msg["role"] == "user": + if msg["content"] is None: + return "user:\n\n" + else: + return f"user:\n{msg['content']}\n" + elif msg["role"] == "assistant": + if msg["content"] is not None and "function_call" in msg: + return f"assistant:\n{msg['content']}\nassistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" + elif "function_call" in msg: + return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" + elif "tool_calls" in msg and len(msg["tool_calls"]) > 0: + for tool_call in msg[ + "tool_calls" + ]: # NOTE: probably doesn't work with the functionary model + return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}\n" + elif msg["content"] is None: + return "assistant" + else: + return f"assistant:\n{msg['content']}\n" + else: + raise ValueError(f"Unsupported role: {msg['role']}") + + return "".join([message_to_str(msg) for msg in all_messages]) + + if tools is not None: + functions = [tool["function"] for tool in tools if tool["type"] == "function"] + + if tool_choice is not None: + function_call = ( + tool_choice if isinstance(tool_choice, str) else tool_choice["function"] + ) + + prompt = prepare_messages_for_inference(messages, functions, tools) + + if function_call is None and (functions is None or len(functions) == 0): + completion_or_completion_chunks = llama.create_completion( + 
prompt=prompt + ":\n", + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=["user:", ""], + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stop = "\n" + completion: llama_types.Completion = llama.create_completion( + prompt=prompt, stop=stop, stream=False + ) # type: ignore + completion_text = completion["choices"][0]["text"] + # strip " to=functions." and ending ":" + function_call = completion_text.split(".")[-1][:-1] + new_prompt = prompt + completion_text + stop + elif isinstance(function_call, str) and function_call != "none": + new_prompt = prompt + f":\n" + elif isinstance(function_call, dict): + new_prompt = prompt + f" to=functions.{function_call['name']}:\n" + function_call = function_call["name"] + else: + new_prompt = prompt + f":\n" + + function_body = None + for function in functions or []: + if function["name"] == function_call: + function_body = function["parameters"] + break + for tool in tools or []: + if tool["type"] == "function" and tool["function"]["name"] == function_call: + function_body = tool["function"]["parameters"] + break + + if function_body is not None: + try: + with suppress_stdout_stderr(disable=llama.verbose): + grammar_text = llama_grammar.json_schema_to_gbnf( + json.dumps(function_body) + ) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + ) + print(grammar_text) + except Exception as e: + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + with suppress_stdout_stderr(disable=llama.verbose): + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF + ) + else: + with suppress_stdout_stderr(disable=llama.verbose): + grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=["user:", ""], + stream=False, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + ) # type: ignore + + assert "usage" in completion + assert isinstance(function_call, str) + assert stream is False # TODO: support stream mode + + if llama.verbose: + print(new_prompt) + print(completion["choices"][0]["text"]) + + # TODO: support stream mode + return llama_types.CreateChatCompletionResponse( + id="chat" + completion["id"], + object="chat.completion", + created=completion["created"], + model=completion["model"], + choices=[ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": function_call, + "arguments": completion["choices"][0]["text"], + }, + "tool_calls": [ + { + "id": function_call, + "type": 
"function", + "function": { + "name": function_call, + "arguments": completion["choices"][0]["text"], + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + usage=completion["usage"], + ) + + +@register_chat_completion_handler("functionary-v1") +@register_chat_completion_handler("functionary-v2") +def functionary_v1_v2_chat_handler( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore +) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer() assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" From 9594d5c888e963d8e756a79ebcd3a1ece65384e3 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Tue, 23 Jan 2024 15:49:48 +0000 Subject: [PATCH 11/27] add hf_tokenizer_path in server ModelSettings --- llama_cpp/server/model.py | 2 ++ llama_cpp/server/settings.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index bbb68069d..5fd0f0861 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -119,6 +119,8 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: use_mmap=settings.use_mmap, use_mlock=settings.use_mlock, kv_overrides=kv_overrides, + # Tokenizer Params (optionally for Functionary function calling) + hf_tokenizer_path=settings.hf_tokenizer_path, # Context Params seed=settings.seed, n_ctx=settings.n_ctx, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 9f0dc8a73..a777ef71e 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -56,6 +56,11 @@ class ModelSettings(BaseSettings): default=None, description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) + # Tokenizer Params + hf_tokenizer_path: Optional[str] = Field( + default=None, + description="Override llama.cpp tokenizer with HF AutoTokenizer from this path if provided.", + ) # Context Params seed: int = Field( default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." 
From 43b452926913e94dc970cbbd436b7dcda2b05692 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Wed, 10 Jan 2024 09:16:59 +0000 Subject: [PATCH 12/27] convert functionary-v1 chat handler to use hf autotokenizer --- llama_cpp/llama_chat_format.py | 86 +++++++++++----------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4bc4a6c97..d9cb1eeab 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -978,8 +978,8 @@ def format_saiga( # Tricky chat formats that require custom chat handlers -@register_chat_completion_handler("functionary") -def functionary_chat_handler( +@register_chat_completion_handler("functionary-v1") +def functionary_v1_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, @@ -1008,6 +1008,12 @@ def functionary_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" + END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" + END_USER_TOKEN = "<|END_OF_USER|>" + END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" + END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" + START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" + END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" def generate_type_definition( param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs @@ -1079,22 +1085,23 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: parameters = function.get("parameters", {}) required_params = parameters.get("required", []) - schema += f" // {description}\n" - schema += f" type {function_name} = (_: {{\n" + schema += f"// {description}\n" + schema += f"type {function_name} = (_: {{\n" for param_name, param in parameters.get("properties", {}).items(): param_description = param.get("description", "") param_type = generate_type_definition(param, 2, shared_definitions) optional_indicator = "" if param_name in required_params else "?" 
- schema += f" // {param_description}\n" - schema += f" {param_name}{optional_indicator}: {param_type},\n" - schema += " }) => any;\n\n" + schema += f"// {param_description}\n" + schema += f"{param_name}{optional_indicator}: {param_type},\n" + schema += "}) => any;\n\n" - schema += "}} // namespace {}\n".format(namespace) + schema += "}} // namespace {}".format(namespace) return schema def prepare_messages_for_inference( messages: List[llama_types.ChatCompletionRequestMessage], + tokenizer: AutoTokenizer, functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, ): @@ -1105,8 +1112,7 @@ def prepare_messages_for_inference( role="system", content=generate_schema_from_functions(functions) ) ) - - if tools is not None: + elif tools is not None: all_messages.append( llama_types.ChatCompletionRequestSystemMessage( role="system", @@ -1136,49 +1142,8 @@ def prepare_messages_for_inference( "name" ] = f"functions.{message['function_call']['name']}" all_messages.append(message) - - all_messages.append( - llama_types.ChatCompletionRequestAssistantMessage( - role="assistant", content=None - ) - ) - - def message_to_str(msg: llama_types.ChatCompletionRequestMessage): - if msg["role"] == "system": - return f"system:\n{msg['content']}\n" - - elif msg["role"] == "function" and "name" in msg: - return f"function name={msg['name']}:\n{msg['content']}\n" - elif msg["role"] == "function" and "function_call" in msg: - return f"function name={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif msg["role"] == "tool": - if msg["content"] is not None: - return f"function name={msg['tool_call_id']}:\n{msg['content']}\n" - else: - return f"function name={msg['tool_call_id']}\n" - elif msg["role"] == "user": - if msg["content"] is None: - return "user:\n\n" - else: - return f"user:\n{msg['content']}\n" - elif msg["role"] == "assistant": - if msg["content"] is not None and "function_call" in msg: - return f"assistant:\n{msg['content']}\nassistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif "function_call" in msg: - return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif "tool_calls" in msg and len(msg["tool_calls"]) > 0: - for tool_call in msg[ - "tool_calls" - ]: # NOTE: probably doesn't work with the functionary model - return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}\n" - elif msg["content"] is None: - return "assistant" - else: - return f"assistant:\n{msg['content']}\n" - else: - raise ValueError(f"Unsupported role: {msg['role']}") - - return "".join([message_to_str(msg) for msg in all_messages]) + + return tokenizer.apply_chat_template(all_messages, tokenize=False) + "assistant:\n" if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1187,19 +1152,24 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + + from transformers import AutoTokenizer + + tokenizer_path = os.path.dirname(llama.model_path) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - prompt = prepare_messages_for_inference(messages, functions, tools) + prompt = prepare_messages_for_inference(messages, tokenizer, functions, tools) if function_call is None and (functions is None or len(functions) == 0): completion_or_completion_chunks = llama.create_completion( - 
prompt=prompt + ":\n", + prompt=prompt, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, typical_p=typical_p, stream=stream, - stop=["user:", ""], + stop=["user:", END_ASSISTANT_TOKEN], max_tokens=max_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, @@ -1217,9 +1187,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): if function_call is None or ( isinstance(function_call, str) and function_call == "auto" ): - stop = "\n" + stop = [END_ASSISTANT_TOKEN, END_FUNCTION_CALL_TOKEN] completion: llama_types.Completion = llama.create_completion( - prompt=prompt, stop=stop, stream=False + prompt=prompt + ":\n", stop=stop, stream=False, max_tokens=max_tokens ) # type: ignore completion_text = completion["choices"][0]["text"] # strip " to=functions." and ending ":" From c9c69478363ce18736965a71c7bf9492f4523d8c Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 08:46:02 +0000 Subject: [PATCH 13/27] add hf_tokenizer + inteegrate functionary-v1.4 prompt template --- llama_cpp/llama.py | 66 ++++++++++------ llama_cpp/llama_chat_format.py | 136 +++++++++++++++++++++------------ 2 files changed, 129 insertions(+), 73 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b5618c10d..345ced850 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -14,6 +14,7 @@ Iterator, Deque, Callable, + Any, ) from collections import deque @@ -783,11 +784,24 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + hf_tokenizer: Optional[Any] = None, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str + + def tokenize(input, hf_tokenizer): + if hf_tokenizer is not None: + return hf_tokenizer.encode(input) + else: + return self.tokenize(input.encode("utf-8"), special=True) + + def detokenize(tokens, hf_tokenizer): + if hf_tokenizer is not None: + return hf_tokenizer.decode(tokens).encode("utf-8") + else: + return self.detokenize(tokens) completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) @@ -797,7 +811,7 @@ def _create_completion( # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[int] = ( ( - self.tokenize(prompt.encode("utf-8"), special=True) + tokenize(prompt, hf_tokenizer) if prompt != "" else [self.token_bos()] ) @@ -903,13 +917,13 @@ def logit_bias_processor( grammar=grammar, ): if token == self._token_eos: - text = self.detokenize(completion_tokens) + text = detokenize(completion_tokens, hf_tokenizer) finish_reason = "stop" break completion_tokens.append(token) - all_text = self.detokenize(completion_tokens) + all_text = detokenize(completion_tokens, hf_tokenizer) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -933,7 +947,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = self.detokenize(remaining_tokens) + remaining_text = detokenize(remaining_tokens, hf_tokenizer) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -955,17 +969,17 @@ def logit_bias_processor( for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(self.detokenize([token])) + token_end_position += len(detokenize([token], hf_tokenizer)) # Check if stop sequence is in the token if 
token_end_position > ( remaining_length - first_stop_position ): break - token_str = self.detokenize([token]).decode( + token_str = detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - self.detokenize(completion_tokens[:returned_tokens]).decode( + detokenize(completion_tokens[:returned_tokens], hf_tokenizer).decode( "utf-8", errors="ignore" ) ) @@ -979,7 +993,7 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode( + detokenize([i], hf_tokenizer).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -987,7 +1001,7 @@ def logit_bias_processor( top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode( + detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ) ], @@ -1003,7 +1017,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( + "text": detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ), "index": 0, @@ -1017,7 +1031,7 @@ def logit_bias_processor( decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = self.detokenize(remaining_tokens[:i]) + bs = detokenize(remaining_tokens[:i], hf_tokenizer) ts = bs.decode("utf-8") decode_success = True break @@ -1052,14 +1066,14 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens) + text = detokenize(completion_tokens, hf_tokenizer) finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = self.detokenize(completion_tokens) + text = detokenize(completion_tokens, hf_tokenizer) finish_reason = "stop" if self.verbose: @@ -1067,7 +1081,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - all_text = self.detokenize(remaining_tokens) + all_text = detokenize(remaining_tokens, hf_tokenizer) any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: end = min(all_text.index(stop) for stop in any_stop) @@ -1076,17 +1090,17 @@ def logit_bias_processor( token_end_position = 0 for token in remaining_tokens: - token_end_position += len(self.detokenize([token])) + token_end_position += len(detokenize([token], hf_tokenizer)) logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: if token == self.token_bos(): continue - token_str = self.detokenize([token]).decode( + token_str = detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - self.detokenize(completion_tokens[:returned_tokens]) + detokenize(completion_tokens[:returned_tokens], hf_tokenizer) ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] @@ -1098,13 +1112,13 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1112,7 +1126,7 @@ def logit_bias_processor( } if token_end_position >= end: - last_text = 
self.detokenize([token]) + last_text = detokenize([token], hf_tokenizer) if token_end_position == end - 1: break returned_tokens += 1 @@ -1141,7 +1155,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( + "text": detokenize([token], hf_tokenizer).decode( "utf-8", errors="ignore" ), "index": 0, @@ -1200,7 +1214,7 @@ def logit_bias_processor( all_tokens = completion_tokens all_token_strs = [ - self.detokenize([token]).decode("utf-8", errors="ignore") + detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") for token in all_tokens ] all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] @@ -1213,7 +1227,7 @@ def logit_bias_processor( text_offsets.append( text_offset + len( - self.detokenize(all_tokens[:idx]).decode( + detokenize(all_tokens[:idx], hf_tokenizer).decode( "utf-8", errors="ignore" ) ) @@ -1226,7 +1240,7 @@ def logit_bias_processor( ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ -1291,6 +1305,7 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + hf_tokenizer: Optional[Any] = None, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1354,6 +1369,7 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + hf_tokenizer=hf_tokenizer, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1481,6 +1497,7 @@ def create_chat_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + hf_tokenizer_path: Optional[str] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1548,6 +1565,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + hf_tokenizer_path=hf_tokenizer_path, ) def __getstate__(self): diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index d9cb1eeab..82d847667 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -978,8 +978,8 @@ def format_saiga( # Tricky chat formats that require custom chat handlers -@register_chat_completion_handler("functionary-v1") -def functionary_v1_chat_handler( +@register_chat_completion_handler("functionary") +def functionary_chat_handler( llama: llama.Llama, messages: List[llama_types.ChatCompletionRequestMessage], functions: Optional[List[llama_types.ChatCompletionFunction]] = None, @@ -1005,6 +1005,7 @@ def functionary_v1_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + hf_tokenizer_path: Optional[str] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -1153,13 +1154,14 @@ def prepare_messages_for_inference( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" from transformers import AutoTokenizer - tokenizer_path = os.path.dirname(llama.model_path) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) prompt = prepare_messages_for_inference(messages, tokenizer, functions, tools) - + + # If no tools/functions are provided if function_call is None and (functions is None or len(functions) == 0): completion_or_completion_chunks = llama.create_completion( prompt=prompt, @@ -1181,39 +1183,23 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, + hf_tokenizer=tokenizer ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore - - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): - stop = [END_ASSISTANT_TOKEN, END_FUNCTION_CALL_TOKEN] - completion: llama_types.Completion = llama.create_completion( - prompt=prompt + ":\n", stop=stop, stream=False, max_tokens=max_tokens - ) # type: ignore - completion_text = completion["choices"][0]["text"] - # strip " to=functions." and ending ":" - function_call = completion_text.split(".")[-1][:-1] - new_prompt = prompt + completion_text + stop - elif isinstance(function_call, str) and function_call != "none": - new_prompt = prompt + f":\n" - elif isinstance(function_call, dict): - new_prompt = prompt + f" to=functions.{function_call['name']}:\n" - function_call = function_call["name"] - else: - new_prompt = prompt + f":\n" - - function_body = None - for function in functions or []: - if function["name"] == function_call: - function_body = function["parameters"] - break - for tool in tools or []: - if tool["type"] == "function" and tool["function"]["name"] == function_call: - function_body = tool["function"]["parameters"] - break - - if function_body is not None: + + assert stream is False # TODO: support stream mode + + def get_grammar(function_call): + function_body = None + for function in functions or []: + if function["name"] == function_call: + function_body = function["parameters"] + break + for tool in tools or []: + if tool["type"] == "function" and tool["function"]["name"] == function_call: + function_body = tool["function"]["parameters"] + break + try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( @@ -1233,21 +1219,38 @@ def prepare_messages_for_inference( grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.JSON_GBNF ) + + return grammar + + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, [], []) + stops = END_ASSISTANT_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + function_call = function_call["name"] + stops = END_FUNCTION_CALL_TOKEN + grammar = get_grammar(function_call) else: - with 
suppress_stdout_stderr(disable=llama.verbose): - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) - + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=["user:", ""], - stream=False, - grammar=grammar, - max_tokens=max_tokens, + prompt=prompt, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, typical_p=typical_p, + stream=stream, + stop=stops, + max_tokens=max_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, repeat_penalty=repeat_penalty, @@ -1257,11 +1260,46 @@ def prepare_messages_for_inference( mirostat_eta=mirostat_eta, model=model, logits_processor=logits_processor, - ) # type: ignore - + grammar=grammar, + hf_tokenizer=tokenizer, + ) + completion_text = completion["choices"][0]["text"] + + # If the generation does not involve a function call + if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: + new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + grammar = get_grammar(function_call) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=END_FUNCTION_CALL_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + else: + new_prompt = prompt + assert "usage" in completion assert isinstance(function_call, str) - assert stream is False # TODO: support stream mode if llama.verbose: print(new_prompt) @@ -1281,7 +1319,7 @@ def prepare_messages_for_inference( "content": None, "function_call": { "name": function_call, - "arguments": completion["choices"][0]["text"], + "arguments": completion["choices"][0]["text"].strip(), }, "tool_calls": [ { @@ -1289,7 +1327,7 @@ def prepare_messages_for_inference( "type": "function", "function": { "name": function_call, - "arguments": completion["choices"][0]["text"], + "arguments": completion["choices"][0]["text"].strip(), }, } ], From f912c6240bfac636d5a4bdfedee4fa44efc87fde Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 10:56:54 +0000 Subject: [PATCH 14/27] integrate functionary v2 prompt template --- llama_cpp/llama_chat_format.py | 191 ++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 52 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 82d847667..54dd44203 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,7 +4,7 @@ import json import ctypes import dataclasses -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol import jinja2 @@ -1009,12 +1009,26 @@ def functionary_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, 
Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" - END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" - END_USER_TOKEN = "<|END_OF_USER|>" - END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" - END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" - START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" - END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" + + assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) + + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.additional_special_tokens: + version = "v1" + END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" + END_USER_TOKEN = "<|END_OF_USER|>" + END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" + END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" + START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" + END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" + else: + version = "v2" + RECIPIENT_TOKEN = "<|recipient|>" + FROM_TOKEN = "<|from|>" + STOP_TOKEN = "<|stop|>" + CONTENT_TOKEN = "<|content|>" def generate_type_definition( param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs @@ -1103,6 +1117,7 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: def prepare_messages_for_inference( messages: List[llama_types.ChatCompletionRequestMessage], tokenizer: AutoTokenizer, + version: Literal["v1", "v2"], functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, tools: Optional[List[llama_types.ChatCompletionTool]] = None, ): @@ -1143,8 +1158,13 @@ def prepare_messages_for_inference( "name" ] = f"functions.{message['function_call']['name']}" all_messages.append(message) + + if version == "v1": + suffix = "assistant:\n" + else: + suffix = "<|from|>assistant\n<|recipient|>" - return tokenizer.apply_chat_template(all_messages, tokenize=False) + "assistant:\n" + return tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1153,16 +1173,17 @@ def prepare_messages_for_inference( function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) - - assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) - prompt = prepare_messages_for_inference(messages, tokenizer, functions, tools) + prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools) # If no tools/functions are provided if function_call is None and (functions is None or len(functions) == 0): + if version == "v1": + stop = END_ASSISTANT_TOKEN + else: + stop = STOP_TOKEN + prompt += "all\n<|content|>" + completion_or_completion_chunks = llama.create_completion( prompt=prompt, temperature=temperature, @@ -1171,7 +1192,7 @@ def prepare_messages_for_inference( min_p=min_p, typical_p=typical_p, stream=stream, - stop=["user:", END_ASSISTANT_TOKEN], + stop=stop, max_tokens=max_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, @@ -1226,20 +1247,24 @@ def get_grammar(function_call): if function_call 
is None or ( isinstance(function_call, str) and function_call == "auto" ): - stops = ["\n", END_ASSISTANT_TOKEN] + stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else CONTENT_TOKEN # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, [], []) - stops = END_ASSISTANT_TOKEN + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + stops = END_ASSISTANT_TOKEN if version == "v1" else STOP_TOKEN # If tool_choice/function_call is provided elif isinstance(function_call, dict): - prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + if version == "v1": + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + stops = END_FUNCTION_CALL_TOKEN + else: + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + stops = STOP_TOKEN function_call = function_call["name"] - stops = END_FUNCTION_CALL_TOKEN grammar = get_grammar(function_call) else: prompt = prompt - stops = ["\n", END_ASSISTANT_TOKEN] + stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN completion: llama_types.Completion = llama.create_completion( prompt=prompt, @@ -1265,38 +1290,100 @@ def get_grammar(function_call): ) completion_text = completion["choices"][0]["text"] - # If the generation does not involve a function call - if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: - new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" - function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() - grammar = get_grammar(function_call) - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=END_FUNCTION_CALL_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + if version == "v1": + # If the generation does not involve a function call + if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: + new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + grammar = get_grammar(function_call) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=END_FUNCTION_CALL_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + 
mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + else: + new_prompt = prompt else: - new_prompt = prompt + # If the generation does not involve a function call + if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + new_prompt = prompt + completion_text + CONTENT_TOKEN + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=STOP_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + elif not prompt.endswith(CONTENT_TOKEN): + new_prompt = prompt + completion_text + CONTENT_TOKEN + function_call = completion_text[:-1].strip() + grammar = get_grammar(function_call) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=STOP_TOKEN, + stream=stream, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + hf_tokenizer=tokenizer, + ) # type: ignore + else: + new_prompt = prompt + assert "usage" in completion assert isinstance(function_call, str) From 3b5fe39288723acfe5a7aed6eba0dee2339dcfa2 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 11:16:19 +0000 Subject: [PATCH 15/27] update readme --- README.md | 8 +++++--- llama_cpp/llama.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0a77bbdaa..bdd45704b 100644 --- a/README.md +++ b/README.md @@ -293,8 +293,9 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr The high-level API also provides a simple interface for function calling. -Note that the only model that supports full function calling at this time is "functionary". -The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) +Note that the only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). +The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). +Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. They are already included in the respective HF repositories hosting the gguf files. 
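Under the hood, the handler loads this tokenizer with `transformers.AutoTokenizer` and renders the prompt through its chat template before handing the text to llama.cpp. A minimal sketch of that step (the path below is a placeholder for the directory containing the HF tokenizer files shipped alongside the GGUF weights, and `transformers` must be installed):

```python
from transformers import AutoTokenizer

# Placeholder path: the directory holding the HF tokenizer files from the GGUF repo.
tokenizer = AutoTokenizer.from_pretrained("path/to/functionary-gguf/")

messages = [{"role": "user", "content": "Extract Jason is 25 years old"}]
# Render the chat template to plain text; llama.cpp then completes this prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
```

The high-level example below does not call any of this directly; it only needs the tokenizer path to be supplied.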
```python >>> from llama_cpp import Llama @@ -337,7 +338,8 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h "function": { "name": "UserDetail" } - }] + }], + hf_tokenizer_path="path/to/functionary-gguf/" ) ``` diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 345ced850..601ea4213 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1335,6 +1335,7 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. + hf_tokenizer: A HuggingFace AutoTokenizer to use optionally. Raises: ValueError: If the requested tokens exceed the context window. @@ -1530,6 +1531,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + hf_tokenizer_path: A HuggingFace AutoTokenizer file path to use. Returns: Generated chat completion or a stream of chat completion chunks. From 4dd6b62340daa971f49eb4b3b8935343e5eebe9b Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 11 Jan 2024 13:23:36 +0000 Subject: [PATCH 16/27] set up parallel function calling wip --- llama_cpp/llama_chat_format.py | 51 +++++++++++++++------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 54dd44203..b20db81aa 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1247,7 +1247,7 @@ def get_grammar(function_call): if function_call is None or ( isinstance(function_call, str) and function_call == "auto" ): - stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else CONTENT_TOKEN + stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" @@ -1354,39 +1354,32 @@ def get_grammar(function_call): ) # type: ignore return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif not prompt.endswith(CONTENT_TOKEN): - new_prompt = prompt + completion_text + CONTENT_TOKEN - function_call = completion_text[:-1].strip() - grammar = get_grammar(function_call) - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=STOP_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + elif prompt.endswith(RECIPIENT_TOKEN): + all_calls = completion_text.split(f"\n{FROM_TOKEN} assistant\n{RECIPIENT_TOKEN}") + function_calls = [curr_call.split(f"\n{CONTENT_TOKEN}")[0].strip() for curr_call in all_calls] + function_bodies = [curr_call.split(f"\n{CONTENT_TOKEN}")[1].strip() for curr_call in all_calls] + breakpoint() else: new_prompt = prompt assert "usage" in completion - assert isinstance(function_call, str) + if function_call is not None: + assert isinstance(function_call, str) + tool_calls = + else: + tool_calls = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { 
+ "id": function_call, + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) if llama.verbose: print(new_prompt) From 03b68fe32da08e916217770caf976ebe69946a92 Mon Sep 17 00:00:00 2001 From: jeffrey-fong Date: Wed, 31 Jan 2024 08:35:40 +0800 Subject: [PATCH 17/27] resolve merge conflict --- README.md | 11 +- llama_cpp/llama_chat_format.py | 273 ++++++++++++++++----------------- 2 files changed, 136 insertions(+), 148 deletions(-) diff --git a/README.md b/README.md index bdd45704b..c84dec624 100644 --- a/README.md +++ b/README.md @@ -293,20 +293,16 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr The high-level API also provides a simple interface for function calling. -Note that the only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). -The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). +The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. + Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. They are already included in the respective HF repositories hosting the gguf files. ```python >>> from llama_cpp import Llama >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") >>> llm.create_chat_completion( + hf_tokenizer_path="path/to/functionary-gguf/" messages = [ - { - "role": "system", - "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary" - - }, { "role": "user", "content": "Extract Jason is 25 years old" @@ -339,7 +335,6 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i "name": "UserDetail" } }], - hf_tokenizer_path="path/to/functionary-gguf/" ) ``` diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b20db81aa..5a7dd50db 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,6 +4,8 @@ import json import ctypes import dataclasses +import random +import string from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol import jinja2 @@ -1243,147 +1245,147 @@ def get_grammar(function_call): return grammar - # If no or "auto" tool_choice/function_call - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): - stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN - # If tool_choice/function_call is "none" - elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" - stops = END_ASSISTANT_TOKEN if version == "v1" else STOP_TOKEN - # If tool_choice/function_call is provided - elif isinstance(function_call, dict): - if version == "v1": + def create_completion(stop): + completion: llama_types.Completion = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + hf_tokenizer=tokenizer, + ) + + return completion + + function_calls, function_bodies = [], [] + + if version == "v1": + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + stops = END_ASSISTANT_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" stops = END_FUNCTION_CALL_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) else: - prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" - stops = STOP_TOKEN - function_call = function_call["name"] - grammar = get_grammar(function_call) - else: - prompt = prompt - stops = ["\n", END_ASSISTANT_TOKEN] if version == "v1" else STOP_TOKEN + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] - completion: llama_types.Completion = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stops, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - 
mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, - hf_tokenizer=tokenizer, - ) - completion_text = completion["choices"][0]["text"] - - if version == "v1": # If the generation does not involve a function call if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If the generation involves a function call in completion, generate the parameters elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: - new_prompt = prompt + completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" - function_call = completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() - grammar = get_grammar(function_call) - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=END_FUNCTION_CALL_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + function_bodies.append(completion["choices"][0]["text"].strip()) + # If the prompt involves a function call, just append generated parameters to function_bodies else: - new_prompt = prompt + function_bodies.append(completion_text.strip()) else: - # If the generation does not involve a function call - if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): - new_prompt = prompt + completion_text + CONTENT_TOKEN - - completion: llama_types.Completion = llama.create_completion( - prompt=new_prompt, - stop=STOP_TOKEN, - stream=stream, - grammar=grammar, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - hf_tokenizer=tokenizer, - ) # type: ignore + # Loop until all parallel function calls are generated + while True: + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + grammar = None + stops = CONTENT_TOKEN + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + stops = STOP_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += 
f"{function_call['name']}\n{CONTENT_TOKEN}" + stops = STOP_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt + stops = STOP_TOKEN + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - elif prompt.endswith(RECIPIENT_TOKEN): - all_calls = completion_text.split(f"\n{FROM_TOKEN} assistant\n{RECIPIENT_TOKEN}") - function_calls = [curr_call.split(f"\n{CONTENT_TOKEN}")[0].strip() for curr_call in all_calls] - function_bodies = [curr_call.split(f"\n{CONTENT_TOKEN}")[1].strip() for curr_call in all_calls] - breakpoint() - else: - new_prompt = prompt + # If the generation does not involve a function call + if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate model response if the model decides not to call any function + elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + prompt += completion_text + CONTENT_TOKEN + completion = create_completion(stop=STOP_TOKEN) + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate parameters if model decides to call a function + elif prompt.endswith(RECIPIENT_TOKEN): + function_calls.append(completion_text[:-1]) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=[STOP_TOKEN, "\n"]) + function_bodies.append(completion["choices"][0]["text"].strip()) + prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" + grammar = None + + # Try to generate the beginning of next turn + # If empty completion, break from loop + next_turn_completion_text = create_completion( + stop=[STOP_TOKEN, RECIPIENT_TOKEN] + )["choices"][0]["text"] + if len(next_turn_completion_text) > 0: + prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" + else: + break + # Break from loop if tool_choice/function_call is provided as a dict + else: + function_bodies.append(completion_text.strip()) + break - assert "usage" in completion - if function_call is not None: - assert isinstance(function_call, str) - tool_calls = - else: - tool_calls = [] - for function_call, function_body in zip(function_calls, function_bodies): - tool_calls.append( - { - "id": function_call, - "type": "function", - "function": { - "name": function_call, - "arguments": function_body, - }, - } - ) - - if llama.verbose: - print(new_prompt) - print(completion["choices"][0]["text"]) + assert len(function_calls) > 0 + assert len(function_calls) == len(function_bodies) + + tool_calls = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { + "id": "call_" + "".join( + [random.choice(string.ascii_letters + string.digits) for _ in range(24)] + ), + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) # TODO: support stream mode return llama_types.CreateChatCompletionResponse( @@ -1398,19 +1400,10 @@ def get_grammar(function_call): "role": "assistant", "content": None, "function_call": { - "name": function_call, - "arguments": completion["choices"][0]["text"].strip(), + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], }, - "tool_calls": [ - { - "id": function_call, - "type": "function", - "function": { - "name": function_call, - 
"arguments": completion["choices"][0]["text"].strip(), - }, - } - ], + "tool_calls": tool_calls, }, "finish_reason": "tool_calls", } From 2957baf8603d83f601ee5b4851337556171c37dd Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Fri, 12 Jan 2024 12:01:15 +0800 Subject: [PATCH 18/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c84dec624..9f0ca86fb 100644 --- a/README.md +++ b/README.md @@ -301,7 +301,7 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i >>> from llama_cpp import Llama >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") >>> llm.create_chat_completion( - hf_tokenizer_path="path/to/functionary-gguf/" + hf_tokenizer_path="path/to/functionary-gguf/", messages = [ { "role": "user", From 7a98b04e0174dfe6a177b047a2f533ab147ff283 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Fri, 12 Jan 2024 12:03:41 +0800 Subject: [PATCH 19/27] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f0ca86fb..02d1b4d5f 100644 --- a/README.md +++ b/README.md @@ -329,12 +329,12 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i } } }], - tool_choice=[{ + tool_choice={ "type": "function", "function": { "name": "UserDetail" } - }], + }, ) ``` From bc9447bf3493eb5027fa0ff5572486f468c61cca Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Tue, 23 Jan 2024 09:53:20 +0000 Subject: [PATCH 20/27] refactor tokenizers --- llama_cpp/llama.py | 120 ++++++++++++++++++--------------- llama_cpp/llama_chat_format.py | 14 ++-- 2 files changed, 71 insertions(+), 63 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 601ea4213..0ffbc678c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -15,6 +15,7 @@ Deque, Callable, Any, + Protocol, ) from collections import deque @@ -61,6 +62,8 @@ def __init__( use_mmap: bool = True, use_mlock: bool = False, kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, + # Tokenizer Params (Optionally for HF AutoTokenizers) + hf_tokenizer_path: Optional[str] = None, # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, @@ -131,6 +134,7 @@ def __init__( use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. kv_overrides: Key-value overrides for the model. + hf_tokenizer_path: Override llama.cpp tokenizer with HF AutoTokenizer from this path if provided. seed: RNG seed, -1 for random n_ctx: Text context, 0 = from model n_batch: Prompt processing maximum batch size @@ -228,6 +232,13 @@ def __init__( self.n_threads_batch = n_threads_batch or max( multiprocessing.cpu_count() // 2, 1 ) + + # Tokenizer Params + if hf_tokenizer_path is not None: + self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) + else: + self._tokenizer_to_use = LlamaCppTokenizer(self._model) + # Context Params self.context_params = llama_cpp.llama_context_default_params() self.context_params.seed = seed @@ -422,7 +433,7 @@ def tokenize( Returns: A list of tokens. """ - return self._model.tokenize(text, add_bos, special) + return self._tokenizer_to_use.encode(text, add_bos, special) def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. @@ -433,7 +444,7 @@ def detokenize(self, tokens: List[int]) -> bytes: Returns: The detokenized string. 
""" - return self._model.detokenize(tokens) + return self._tokenizer_to_use.decode(tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. @@ -784,24 +795,11 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, - hf_tokenizer: Optional[Any] = None, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str - - def tokenize(input, hf_tokenizer): - if hf_tokenizer is not None: - return hf_tokenizer.encode(input) - else: - return self.tokenize(input.encode("utf-8"), special=True) - - def detokenize(tokens, hf_tokenizer): - if hf_tokenizer is not None: - return hf_tokenizer.decode(tokens).encode("utf-8") - else: - return self.detokenize(tokens) completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) @@ -811,7 +809,7 @@ def detokenize(tokens, hf_tokenizer): # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[int] = ( ( - tokenize(prompt, hf_tokenizer) + self.tokenize(prompt.encode("utf-8"), special=True) if prompt != "" else [self.token_bos()] ) @@ -917,13 +915,13 @@ def logit_bias_processor( grammar=grammar, ): if token == self._token_eos: - text = detokenize(completion_tokens, hf_tokenizer) + text = self.detokenize(completion_tokens) finish_reason = "stop" break completion_tokens.append(token) - all_text = detokenize(completion_tokens, hf_tokenizer) + all_text = self.detokenize(completion_tokens) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -947,7 +945,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = detokenize(remaining_tokens, hf_tokenizer) + remaining_text = self.detokenize(remaining_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -969,17 +967,17 @@ def logit_bias_processor( for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(detokenize([token], hf_tokenizer)) + token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = detokenize([token], hf_tokenizer).decode( + token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - detokenize(completion_tokens[:returned_tokens], hf_tokenizer).decode( + self.detokenize(completion_tokens[:returned_tokens]).decode( "utf-8", errors="ignore" ) ) @@ -993,7 +991,7 @@ def logit_bias_processor( ) ) top_logprob = { - detokenize([i], hf_tokenizer).decode( + self.detokenize([i]).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -1001,7 +999,7 @@ def logit_bias_processor( top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - detokenize([token], hf_tokenizer).decode( + self.detokenize([token]).decode( "utf-8", errors="ignore" ) ], @@ -1017,7 +1015,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": detokenize([token], hf_tokenizer).decode( + "text": self.detokenize([token]).decode( "utf-8", errors="ignore" ), "index": 0, @@ -1031,7 +1029,7 @@ def logit_bias_processor( decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = detokenize(remaining_tokens[:i], hf_tokenizer) + bs = 
self.detokenize(remaining_tokens[:i]) ts = bs.decode("utf-8") decode_success = True break @@ -1066,14 +1064,15 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = detokenize(completion_tokens, hf_tokenizer) + text = self.detokenize(completion_tokens) + finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = detokenize(completion_tokens, hf_tokenizer) + text = self.detokenize(completion_tokens) finish_reason = "stop" if self.verbose: @@ -1081,7 +1080,7 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - all_text = detokenize(remaining_tokens, hf_tokenizer) + all_text = self.detokenize(remaining_tokens) any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: end = min(all_text.index(stop) for stop in any_stop) @@ -1090,17 +1089,17 @@ def logit_bias_processor( token_end_position = 0 for token in remaining_tokens: - token_end_position += len(detokenize([token], hf_tokenizer)) + token_end_position += len(self.detokenize([token])) logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: if token == self.token_bos(): continue - token_str = detokenize([token], hf_tokenizer).decode( + token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( - detokenize(completion_tokens[:returned_tokens], hf_tokenizer) + self.detokenize(completion_tokens[:returned_tokens]) ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] @@ -1112,13 +1111,13 @@ def logit_bias_processor( ) ) top_logprob = { - detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") + self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1126,7 +1125,7 @@ def logit_bias_processor( } if token_end_position >= end: - last_text = detokenize([token], hf_tokenizer) + last_text = self.detokenize([token]) if token_end_position == end - 1: break returned_tokens += 1 @@ -1155,7 +1154,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": detokenize([token], hf_tokenizer).decode( + "text": self.detokenize([token]).decode( "utf-8", errors="ignore" ), "index": 0, @@ -1214,7 +1213,7 @@ def logit_bias_processor( all_tokens = completion_tokens all_token_strs = [ - detokenize([token], hf_tokenizer).decode("utf-8", errors="ignore") + self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] @@ -1227,7 +1226,7 @@ def logit_bias_processor( text_offsets.append( text_offset + len( - detokenize(all_tokens[:idx], hf_tokenizer).decode( + self.detokenize(all_tokens[:idx]).decode( "utf-8", errors="ignore" ) ) @@ -1240,7 +1239,7 @@ def logit_bias_processor( ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { - detokenize([i], hf_tokenizer).decode("utf-8", errors="ignore"): logprob + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ 
-1305,7 +1304,6 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, - hf_tokenizer: Optional[Any] = None, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1335,7 +1333,6 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. - hf_tokenizer: A HuggingFace AutoTokenizer to use optionally. Raises: ValueError: If the requested tokens exceed the context window. @@ -1370,7 +1367,6 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, - hf_tokenizer=hf_tokenizer, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1498,7 +1494,6 @@ def create_chat_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, - hf_tokenizer_path: Optional[str] = None, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1531,7 +1526,6 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. - hf_tokenizer_path: A HuggingFace AutoTokenizer file path to use. Returns: Generated chat completion or a stream of chat completion chunks. @@ -1567,7 +1561,6 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, - hf_tokenizer_path=hf_tokenizer_path, ) def __getstate__(self): @@ -1711,9 +1704,9 @@ def n_vocab(self) -> int: """Return the vocabulary size.""" return self._model.n_vocab() - def tokenizer(self) -> "LlamaTokenizer": + def tokenizer(self) -> Union["LlamaCppTokenizer", "HFTokenizer"]: """Return the tokenizer for this model.""" - return LlamaTokenizer(self) + return self._tokenizer_to_use def token_eos(self) -> int: """Return the end-of-sequence token.""" @@ -1756,21 +1749,40 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): return longest_prefix -class LlamaTokenizer: +class LlamaTokenizer(Protocol): + def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + ... + + def decode(self, tokens: List[int]) -> bytes: + ... 
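# Illustrative usage sketch (hypothetical `llm = Llama(...)` instance): both tokenizer
# implementations below satisfy this bytes-in/bytes-out protocol, so
# Llama.tokenize()/Llama.detokenize() can stay agnostic of the backend:
#
#   tok = llm.tokenizer()                                    # LlamaCppTokenizer or HFTokenizer
#   ids = tok.encode(b"hello", add_bos=True, special=True)   # -> List[int]
#   text = tok.decode(ids)                                    # -> bytes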
+ +class LlamaCppTokenizer: def __init__(self, llama: Llama): self.llama = llama - def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True - ) + def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + return self.llama._model.tokenize(text, add_bos, special) - def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + def decode(self, tokens: List[int]) -> bytes: + return self.llama._model.detokenize(tokens) @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": return cls(Llama(model_path=path, vocab_only=True)) + + +class HFTokenizer: + def __init__(self, hf_tokenizer_path): + from transformers import AutoTokenizer + self.hf_tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) + + def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: + return self.hf_tokenizer.encode( + text.decode("utf-8", errors="ignore"), add_special_tokens=special + ) + + def decode(self, tokens: List[int]) -> bytes: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") class LlamaState: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5a7dd50db..3fd678875 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1007,17 +1007,15 @@ def functionary_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, - hf_tokenizer_path: Optional[str] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" - assert hf_tokenizer_path is not None, "Please provide a valid hf tokenizer path from https://huggingface.co/meetkai" + tokenizer = llama.tokenizer() + assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) - - if "<|START_OF_FUNCTION_CALL|>" in tokenizer.additional_special_tokens: + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: version = "v1" END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" END_USER_TOKEN = "<|END_OF_USER|>" @@ -1166,7 +1164,7 @@ def prepare_messages_for_inference( else: suffix = "<|from|>assistant\n<|recipient|>" - return tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix + return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1206,7 +1204,6 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, - hf_tokenizer=tokenizer ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore @@ -1266,7 +1263,6 @@ def create_completion(stop): model=model, logits_processor=logits_processor, grammar=grammar, - hf_tokenizer=tokenizer, ) return completion @@ -1293,7 +1289,7 @@ def create_completion(stop): else: prompt = prompt stops = ["\n", END_ASSISTANT_TOKEN] - + completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] From 8d334dfea84fc0abf71c6d1737eb9dc963ff025d Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Tue, 23 Jan 2024 14:09:00 +0000 Subject: [PATCH 21/27] include old functionary handler for backward compatibility --- README.md | 5 +- llama_cpp/llama.py | 17 +- llama_cpp/llama_chat_format.py | 354 +++++++++++++++++++++++++++++++++ 3 files changed, 365 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 02d1b4d5f..40f9621e8 100644 --- a/README.md +++ b/README.md @@ -293,15 +293,14 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr The high-level API also provides a simple interface for function calling. -The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. +The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class. Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. 
They are already included in the respective HF repositories hosting the gguf files. ```python >>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") +>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", hf_tokenizer_path="path/to/functionary-gguf/", chat_format="functionary-v2") >>> llm.create_chat_completion( - hf_tokenizer_path="path/to/functionary-gguf/", messages = [ { "role": "user", diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0ffbc678c..5b73c4537 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -233,12 +233,6 @@ def __init__( multiprocessing.cpu_count() // 2, 1 ) - # Tokenizer Params - if hf_tokenizer_path is not None: - self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) - else: - self._tokenizer_to_use = LlamaCppTokenizer(self._model) - # Context Params self.context_params = llama_cpp.llama_context_default_params() self.context_params.seed = seed @@ -290,6 +284,13 @@ def __init__( self._model = _LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose ) + + # Tokenizer Params + if hf_tokenizer_path is not None: + self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) + else: + self._tokenizer_to_use = LlamaCppTokenizer(self._model) + # Set the default value for the context and correct the batch if n_ctx == 0: n_ctx = self._model.n_ctx_train() @@ -1761,10 +1762,10 @@ def __init__(self, llama: Llama): self.llama = llama def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: - return self.llama._model.tokenize(text, add_bos, special) + return self.llama.tokenize(text, add_bos, special) def decode(self, tokens: List[int]) -> bytes: - return self.llama._model.detokenize(tokens) + return self.llama.detokenize(tokens) @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 3fd678875..946955532 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1010,6 +1010,360 @@ def functionary_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" + + def generate_type_definition( + param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs + ) -> str: + indent = " " * indent_level + if "$ref" in param: + # Reference to a shared definition + ref_name = param["$ref"].split("/")[ + -1 + ] # Extract the type name from the reference + return ref_name + elif param.get("type") == "array": + items = param.get("items", {}) + item_type = generate_type_definition(items, indent_level + 1, shared_defs) + return f"Array<{item_type}>" + elif param.get("type") == "object": + properties = param.get("properties", {}) + nested_schema = "{\n" + for nested_param_name, nested_param in properties.items(): + nested_param_type = generate_type_definition( + nested_param, indent_level + 1, shared_defs + ) + nested_schema += ( + f"{indent} {nested_param_name}: {nested_param_type},\n" + ) + nested_schema += indent + "}" + return nested_schema + elif "enum" in param: + # Enum type + return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]]) + else: + # Simple type + return param.get("type", "any") + + def generate_shared_definitions(shared_defs, indent_level: int) -> str: + indent = " " * indent_level + shared_definitions = "" + for def_name, def_properties in shared_defs.items(): + shared_definitions += f"{indent}type {def_name} = " + if def_properties.get("type") == "object": + shared_definitions += generate_type_definition( + def_properties, indent_level, shared_defs + ) + elif "enum" in def_properties: + # Enum type + shared_definitions += " | ".join( + [f'"{enum_value}"' for enum_value in def_properties["enum"]] + ) + shared_definitions += ";\n" + return shared_definitions + + def generate_schema_from_functions(functions, namespace="functions") -> str: + schema = ( + "// Supported function definitions that should be called when necessary.\n" + ) + schema += f"namespace {namespace} {{\n\n" + + # Generate shared definitions + shared_definitions = {} + for function in functions: + parameters = function.get("parameters", {}) + shared_definitions.update(parameters.get("$defs", {})) + + schema += generate_shared_definitions(shared_definitions, 1) + + for function in functions: + function_name = function["name"] + description = function.get("description", "") + parameters = function.get("parameters", {}) + required_params = parameters.get("required", []) + + schema += f" // {description}\n" + schema += f" type {function_name} = (_: {{\n" + + for param_name, param in parameters.get("properties", {}).items(): + param_description = param.get("description", "") + param_type = generate_type_definition(param, 2, shared_definitions) + optional_indicator = "" if param_name in required_params else "?" 
+ schema += f" // {param_description}\n" + schema += f" {param_name}{optional_indicator}: {param_type},\n" + schema += " }) => any;\n\n" + + schema += "}} // namespace {}\n".format(namespace) + return schema + + def prepare_messages_for_inference( + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + ): + all_messages: List[llama_types.ChatCompletionRequestMessage] = [] + if functions is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=generate_schema_from_functions(functions) + ) + ) + + if tools is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", + content=generate_schema_from_functions( + [ + tool["function"] + for tool in tools + if tool["type"] == "function" + ] + ), + ) + ) + + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=SYSTEM_MESSAGE + ) + ) + + for message in messages: + # Function call responses + if message["role"] == "function" and "name" in message: + message["name"] = f"functions.{message['name']}" + # Function call requests by assistant + if "function_call" in message: + message["function_call"][ + "name" + ] = f"functions.{message['function_call']['name']}" + all_messages.append(message) + + all_messages.append( + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", content=None + ) + ) + + def message_to_str(msg: llama_types.ChatCompletionRequestMessage): + if msg["role"] == "system": + return f"system:\n{msg['content']}\n" + + elif msg["role"] == "function" and "name" in msg: + return f"function name={msg['name']}:\n{msg['content']}\n" + elif msg["role"] == "function" and "function_call" in msg: + return f"function name={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" + elif msg["role"] == "tool": + if msg["content"] is not None: + return f"function name={msg['tool_call_id']}:\n{msg['content']}\n" + else: + return f"function name={msg['tool_call_id']}\n" + elif msg["role"] == "user": + if msg["content"] is None: + return "user:\n\n" + else: + return f"user:\n{msg['content']}\n" + elif msg["role"] == "assistant": + if msg["content"] is not None and "function_call" in msg: + return f"assistant:\n{msg['content']}\nassistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" + elif "function_call" in msg: + return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" + elif "tool_calls" in msg and len(msg["tool_calls"]) > 0: + for tool_call in msg[ + "tool_calls" + ]: # NOTE: probably doesn't work with the functionary model + return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}\n" + elif msg["content"] is None: + return "assistant" + else: + return f"assistant:\n{msg['content']}\n" + else: + raise ValueError(f"Unsupported role: {msg['role']}") + + return "".join([message_to_str(msg) for msg in all_messages]) + + if tools is not None: + functions = [tool["function"] for tool in tools if tool["type"] == "function"] + + if tool_choice is not None: + function_call = ( + tool_choice if isinstance(tool_choice, str) else tool_choice["function"] + ) + + prompt = prepare_messages_for_inference(messages, functions, tools) + + if function_call is None and (functions is None or len(functions) == 0): + completion_or_completion_chunks = llama.create_completion( + 
prompt=prompt + ":\n", + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=["user:", ""], + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stop = "\n" + completion: llama_types.Completion = llama.create_completion( + prompt=prompt, stop=stop, stream=False + ) # type: ignore + completion_text = completion["choices"][0]["text"] + # strip " to=functions." and ending ":" + function_call = completion_text.split(".")[-1][:-1] + new_prompt = prompt + completion_text + stop + elif isinstance(function_call, str) and function_call != "none": + new_prompt = prompt + f":\n" + elif isinstance(function_call, dict): + new_prompt = prompt + f" to=functions.{function_call['name']}:\n" + function_call = function_call["name"] + else: + new_prompt = prompt + f":\n" + + function_body = None + for function in functions or []: + if function["name"] == function_call: + function_body = function["parameters"] + break + for tool in tools or []: + if tool["type"] == "function" and tool["function"]["name"] == function_call: + function_body = tool["function"]["parameters"] + break + + if function_body is not None: + try: + with suppress_stdout_stderr(disable=llama.verbose): + grammar_text = llama_grammar.json_schema_to_gbnf( + json.dumps(function_body) + ) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + ) + print(grammar_text) + except Exception as e: + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + with suppress_stdout_stderr(disable=llama.verbose): + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF + ) + else: + with suppress_stdout_stderr(disable=llama.verbose): + grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + + completion: llama_types.Completion = llama.create_completion( + prompt=new_prompt, + stop=["user:", ""], + stream=False, + grammar=grammar, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + ) # type: ignore + + assert "usage" in completion + assert isinstance(function_call, str) + assert stream is False # TODO: support stream mode + + if llama.verbose: + print(new_prompt) + print(completion["choices"][0]["text"]) + + # TODO: support stream mode + return llama_types.CreateChatCompletionResponse( + id="chat" + completion["id"], + object="chat.completion", + created=completion["created"], + model=completion["model"], + choices=[ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": function_call, + "arguments": completion["choices"][0]["text"], + }, + "tool_calls": [ + { + "id": function_call, + "type": 
"function", + "function": { + "name": function_call, + "arguments": completion["choices"][0]["text"], + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + usage=completion["usage"], + ) + + +@register_chat_completion_handler("functionary-v1") +@register_chat_completion_handler("functionary-v2") +def functionary_v1_v2_chat_handler( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore +) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer() assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" From 8d08b2d3088db61589f6ffd0932c7a5f1c7101d0 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Tue, 23 Jan 2024 15:49:48 +0000 Subject: [PATCH 22/27] add hf_tokenizer_path in server ModelSettings --- llama_cpp/server/model.py | 2 ++ llama_cpp/server/settings.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index bbb68069d..5fd0f0861 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -119,6 +119,8 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: use_mmap=settings.use_mmap, use_mlock=settings.use_mlock, kv_overrides=kv_overrides, + # Tokenizer Params (optionally for Functionary function calling) + hf_tokenizer_path=settings.hf_tokenizer_path, # Context Params seed=settings.seed, n_ctx=settings.n_ctx, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 9fe1a7bfd..0cf1ad0a5 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -56,6 +56,11 @@ class ModelSettings(BaseSettings): default=None, description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) + # Tokenizer Params + hf_tokenizer_path: Optional[str] = Field( + default=None, + description="Override llama.cpp tokenizer with HF AutoTokenizer from this path if provided.", + ) # Context Params seed: int = Field( default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." 
From 3657cba19c39072818770c8cfaff48689a5e3dc5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 Jan 2024 15:52:37 -0500 Subject: [PATCH 23/27] Cleanup PR, fix breaking changes --- llama_cpp/llama.py | 96 ++++++++++++++++++++-------------- llama_cpp/llama_chat_format.py | 2 +- llama_cpp/server/model.py | 6 +++ 3 files changed, 65 insertions(+), 39 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7440c386b..f30b87d0f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,6 +2,7 @@ import os import sys +import abc import uuid import time import multiprocessing @@ -15,12 +16,13 @@ Deque, Callable, Any, - Protocol, ) from collections import deque import ctypes +from llama_cpp.llama_types import List + from .llama_types import * from .llama_grammar import LlamaGrammar from .llama_cache import ( @@ -66,8 +68,6 @@ def __init__( use_mmap: bool = True, use_mlock: bool = False, kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None, - # Tokenizer Params (Optionally for HF AutoTokenizers) - hf_tokenizer_path: Optional[str] = None, # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, @@ -99,6 +99,8 @@ def __init__( chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Speculative Decoding draft_model: Optional[LlamaDraftModel] = None, + # Tokenizer Override + tokenizer: Optional[BaseLlamaTokenizer] = None, # Misc verbose: bool = True, # Extra Params @@ -140,7 +142,6 @@ def __init__( use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. kv_overrides: Key-value overrides for the model. - hf_tokenizer_path: Override llama.cpp tokenizer with HF AutoTokenizer from this path if provided. seed: RNG seed, -1 for random n_ctx: Text context, 0 = from model n_batch: Prompt processing maximum batch size @@ -164,6 +165,7 @@ def __init__( chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. draft_model: Optional draft model to use for speculative decoding. + tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. verbose: Print verbose output to stderr. Raises: @@ -291,13 +293,10 @@ def __init__( self._model = _LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose ) - - # Tokenizer Params - if hf_tokenizer_path is not None: - self._tokenizer_to_use = HFTokenizer(hf_tokenizer_path) - else: - self._tokenizer_to_use = LlamaCppTokenizer(self._model) - + + # Override tokenizer + self.tokenizer_ = tokenizer or LlamaTokenizer(self) + # Set the default value for the context and correct the batch if n_ctx == 0: n_ctx = self._model.n_ctx_train() @@ -443,7 +442,7 @@ def tokenize( Returns: A list of tokens. """ - return self._tokenizer_to_use.encode(text, add_bos, special) + return self.tokenizer_.tokenize(text, add_bos, special) def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. @@ -454,7 +453,7 @@ def detokenize(self, tokens: List[int]) -> bytes: Returns: The detokenized string. """ - return self._tokenizer_to_use.decode(tokens) + return self.tokenizer_.detokenize(tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. 
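# Illustrative end-state usage of the reworked tokenizer override (placeholder paths;
# LlamaHFTokenizer is defined further down in this patch):
#
#   import llama_cpp
#
#   llm = llama_cpp.Llama(
#       model_path="path/to/functionary/llama-model.gguf",
#       chat_format="functionary-v2",
#       tokenizer=llama_cpp.LlamaHFTokenizer.from_pretrained("path/to/functionary-gguf/"),
#   )
#   llm.tokenize(b"hello")  # delegates to tokenizer_, i.e. the HF tokenizer here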
@@ -1706,9 +1705,9 @@ def n_vocab(self) -> int: """Return the vocabulary size.""" return self._model.n_vocab() - def tokenizer(self) -> Union["LlamaCppTokenizer", "HFTokenizer"]: - """Return the tokenizer for this model.""" - return self._tokenizer_to_use + def tokenizer(self) -> LlamaTokenizer: + """Return the llama tokenizer for this model.""" + return LlamaTokenizer(self) def token_eos(self) -> int: """Return the end-of-sequence token.""" @@ -1751,41 +1750,62 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): return longest_prefix -class LlamaTokenizer(Protocol): - def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: - ... - - def decode(self, tokens: List[int]) -> bytes: - ... +class BaseLlamaTokenizer(abc.ABC): + @abc.abstractmethod + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + raise NotImplementedError + + @abc.abstractmethod + def detokenize(self, tokens: List[int]) -> bytes: + raise NotImplementedError -class LlamaCppTokenizer: + +class LlamaTokenizer(BaseLlamaTokenizer): def __init__(self, llama: Llama): self.llama = llama + self._model = llama._model # type: ignore + + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + return self._model.tokenize(text, add_bos=add_bos, special=special) + + def detokenize(self, tokens: List[int]) -> bytes: + return self._model.detokenize(tokens) - def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: - return self.llama.tokenize(text, add_bos, special) + def encode(self, text: str, add_bos: bool = True, special: bool = True) -> List[int]: + return self.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special + ) - def decode(self, tokens: List[int]) -> bytes: - return self.llama.detokenize(tokens) + def decode(self, tokens: List[int]) -> str: + return self.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": return cls(Llama(model_path=path, vocab_only=True)) - -class HFTokenizer: - def __init__(self, hf_tokenizer_path): - from transformers import AutoTokenizer - self.hf_tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path) - - def encode(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]: - return self.hf_tokenizer.encode( - text.decode("utf-8", errors="ignore"), add_special_tokens=special - ) + +class LlamaHFTokenizer(BaseLlamaTokenizer): + def __init__(self, hf_tokenizer: Any): + self.hf_tokenizer = hf_tokenizer + + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + return self.hf_tokenizer.encode(text.decode("utf-8", errors="ignore"), add_special_tokens=special) - def decode(self, tokens: List[int]) -> bytes: + def detokenize(self, tokens: List[int]) -> bytes: return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": + try: + from transformers import AutoTokenizer + except ImportError: + raise ImportError( + "The `transformers` library is required to use the `HFTokenizer`." + "You can install it with `pip install transformers`." 
+ ) + hf_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) + return cls(hf_tokenizer) + class LlamaState: def __init__( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b05babe56..2e4204121 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1366,7 +1366,7 @@ def functionary_v1_v2_chat_handler( ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" - tokenizer = llama.tokenizer() + tokenizer = llama.tokenizer_ assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 59b96a75d..9ed17c266 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -92,6 +92,10 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: json.load(open(settings.hf_tokenizer_config_path)) ) ) + + tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None + if settings.hf_pretrained_model_name_or_path is not None: + tokenizer = llama_cpp.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) draft_model = None if settings.draft_model is not None: @@ -158,6 +162,8 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler=chat_handler, # Speculative Decoding draft_model=draft_model, + # Tokenizer + tokenizer=tokenizer, # Misc verbose=settings.verbose, ) From a79743b15764541596982b3d90fbe16ae1b1057a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 Jan 2024 16:08:47 -0500 Subject: [PATCH 24/27] Use hf_pretrained_model_name_or_path for tokenizer --- llama_cpp/server/model.py | 4 +--- llama_cpp/server/settings.py | 5 ----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 9ed17c266..6d8ec2467 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -92,7 +92,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: json.load(open(settings.hf_tokenizer_config_path)) ) ) - + tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: tokenizer = llama_cpp.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) @@ -130,8 +130,6 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: use_mmap=settings.use_mmap, use_mlock=settings.use_mlock, kv_overrides=kv_overrides, - # Tokenizer Params (optionally for Functionary function calling) - hf_tokenizer_path=settings.hf_tokenizer_path, # Context Params seed=settings.seed, n_ctx=settings.n_ctx, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 32e4acbb1..60f3eeca2 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -56,11 +56,6 @@ class ModelSettings(BaseSettings): default=None, description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). 
Valid true values are (true, TRUE, 1), otherwise false.", ) - # Tokenizer Params - hf_tokenizer_path: Optional[str] = Field( - default=None, - description="Override llama.cpp tokenizer with HF AutoTokenizer from this path if provided.", - ) # Context Params seed: int = Field( default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." From 7b36eb3facb3ab161e22db2576405eb520180f4f Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 1 Feb 2024 08:15:03 +0000 Subject: [PATCH 25/27] fix hf tokenizer in streaming --- llama_cpp/llama.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f30b87d0f..80e171173 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -946,7 +946,11 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = self.detokenize(remaining_tokens) + if isinstance(self.tokenizer_, LlamaHFTokenizer): + prev_text = self.detokenize(completion_tokens[:returned_tokens]) + remaining_text = all_text[len(prev_text):] + else: + remaining_text = self.detokenize(remaining_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -968,13 +972,17 @@ def logit_bias_processor( for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(self.detokenize([token])) + if isinstance(self.tokenizer_, LlamaHFTokenizer): + detokenized_token = remaining_text + else: + detokenized_token = self.detokenize([token]) + token_end_position += len(detokenized_token) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = self.detokenize([token]).decode( + token_str = detokenized_token.decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( @@ -999,11 +1007,7 @@ def logit_bias_processor( } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { - "tokens": [ - self.detokenize([token]).decode( - "utf-8", errors="ignore" - ) - ], + "tokens": [token_str], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], @@ -1016,9 +1020,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), + "text": token_str, "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, @@ -1030,7 +1032,10 @@ def logit_bias_processor( decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = self.detokenize(remaining_tokens[:i]) + if isinstance(self.tokenizer_, LlamaHFTokenizer): + bs = remaining_text + else: + bs = self.detokenize(remaining_tokens[:i]) ts = bs.decode("utf-8") decode_success = True break From 5ea9b1919108f8190af897154f0bf3362132f510 Mon Sep 17 00:00:00 2001 From: jeffrey-fong Date: Thu, 1 Feb 2024 16:44:16 +0800 Subject: [PATCH 26/27] update README --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5be62fd93..bddef6459 100644 --- a/README.md +++ b/README.md @@ -295,11 +295,12 @@ The high-level API also provides a simple interface for function calling. The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). 
 Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary support **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class.
 
-Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the path to the HF tokenizer for functionary. They are already included in the respective HF repositories hosting the gguf files.
+Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide the HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in the Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
 
 ```python
->>> from llama_cpp import Llama
->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", hf_tokenizer_path="path/to/functionary-gguf/", chat_format="functionary-v2")
+>>> from llama_cpp import Llama, LlamaHFTokenizer
+>>> tokenizer = LlamaHFTokenizer.from_pretrained("path/to/functionary/")
+>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", tokenizer=tokenizer, chat_format="functionary-v2")
 >>> llm.create_chat_completion(
       messages = [
         {

From 24eb0dba3ba7b09012227fb119219d2aff87ccba Mon Sep 17 00:00:00 2001
From: Jeffrey Fong
Date: Fri, 2 Feb 2024 03:40:54 +0000
Subject: [PATCH 27/27] refactor offset mapping

---
 llama_cpp/llama.py | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 80e171173..3e032e47e 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -444,16 +444,17 @@ def tokenize(
         """
         return self.tokenizer_.tokenize(text, add_bos, special)
 
-    def detokenize(self, tokens: List[int]) -> bytes:
+    def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes:
         """Detokenize a list of tokens.
 
         Args:
             tokens: The list of tokens to detokenize.
+            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
 
         Returns:
             The detokenized string.
         """
-        return self.tokenizer_.detokenize(tokens)
+        return self.tokenizer_.detokenize(tokens, prev_tokens)
 
     def set_cache(self, cache: Optional[BaseLlamaCache]):
         """Set the cache.
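The intent of the new `prev_tokens` argument is easier to see outside the diff: decode the full token sequence and the already-emitted prefix, then return only the suffix, so streamed output never splits a multi-byte or multi-token character. The standalone sketch below re-implements that idea; it uses an arbitrary HF tokenizer (`gpt2`) purely for illustration and is not code from this patch.

```python
# Standalone sketch of offset-mapping detokenization (not part of the patch).
from typing import List, Optional
from transformers import AutoTokenizer

hf = AutoTokenizer.from_pretrained("gpt2")  # arbitrary tokenizer, illustration only

def detokenize(tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes:
    # Decode everything, decode the previously seen prefix, keep only the new suffix.
    text = hf.decode(tokens).encode("utf-8", errors="ignore")
    if prev_tokens is None:
        return text
    prev_text = hf.decode(prev_tokens).encode("utf-8", errors="ignore")
    return text[len(prev_text):]

tokens = hf.encode("Hello, world!")
for i in range(1, len(tokens) + 1):
    # Each streaming step yields only the newly decoded bytes.
    print(detokenize(tokens[:i], tokens[:i - 1]))
```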
@@ -946,11 +947,8 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - if isinstance(self.tokenizer_, LlamaHFTokenizer): - prev_text = self.detokenize(completion_tokens[:returned_tokens]) - remaining_text = all_text[len(prev_text):] - else: - remaining_text = self.detokenize(remaining_tokens) + prev_tokens = completion_tokens[:returned_tokens] + remaining_text = self.detokenize(completion_tokens, prev_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -972,17 +970,13 @@ def logit_bias_processor( for token in remaining_tokens: if token == self.token_bos(): continue - if isinstance(self.tokenizer_, LlamaHFTokenizer): - detokenized_token = remaining_text - else: - detokenized_token = self.detokenize([token]) - token_end_position += len(detokenized_token) + token_end_position += len(remaining_text) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = detokenized_token.decode( + token_str = remaining_text.decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( @@ -1032,10 +1026,7 @@ def logit_bias_processor( decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - if isinstance(self.tokenizer_, LlamaHFTokenizer): - bs = remaining_text - else: - bs = self.detokenize(remaining_tokens[:i]) + bs = remaining_text ts = bs.decode("utf-8") decode_success = True break @@ -1761,7 +1752,7 @@ def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> L raise NotImplementedError @abc.abstractmethod - def detokenize(self, tokens: List[int]) -> bytes: + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: raise NotImplementedError @@ -1773,8 +1764,11 @@ def __init__(self, llama: Llama): def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: return self._model.tokenize(text, add_bos=add_bos, special=special) - def detokenize(self, tokens: List[int]) -> bytes: - return self._model.detokenize(tokens) + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + if prev_tokens is not None: + return self._model.detokenize(tokens[len(prev_tokens):]) + else: + return self._model.detokenize(tokens) def encode(self, text: str, add_bos: bool = True, special: bool = True) -> List[int]: return self.tokenize( @@ -1796,8 +1790,13 @@ def __init__(self, hf_tokenizer: Any): def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: return self.hf_tokenizer.encode(text.decode("utf-8", errors="ignore"), add_special_tokens=special) - def detokenize(self, tokens: List[int]) -> bytes: - return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + if prev_tokens is not None: + text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + prev_text = self.hf_tokenizer.decode(prev_tokens).encode("utf-8", errors="ignore") + return text[len(prev_text):] + else: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":