Commit d389d64

Merge of 2 parents: 06548c5 + 4ff8def

File tree

3 files changed: +46 −55 lines changed

‎llama_cpp/llama_chat_format.py

+35 −11 (35 additions & 11 deletions)

@@ -73,13 +73,16 @@ def _map_roles(
 
 
 def _format_llama2(
-    system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str
+    system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, sep2: str
 ) -> str:
     """Format the prompt with the llama2 style."""
+    seps = [sep, sep2]
     ret = system_message + sep
-    for role, message in messages:
-        if message:
-            ret += role + message + " "
+    for i, (role, message) in enumerate(messages):
+        if system_message and i == 0:
+            ret += message + seps[i % 2]
+        elif message:
+            ret += role + message + " " + seps[i % 2]
         else:
             ret += role + " "
     return ret
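
For reference, a quick sketch of how the reworked helper alternates its two separators. This is illustrative only: the toy role tags, messages, and system string below are hypothetical inputs, and the behavior described in the comments is traced by hand from the definition above.

prompt = _format_llama2(
    system_message="<<SYS>>sys<</SYS>>",
    messages=[("[INST]", "hi"), ("[/INST]", "hello"), ("[INST]", "bye")],
    sep=" ",
    sep2="</s>",
)
# i == 0 with a non-empty system message: "hi" is appended without its role tag and closed by sep (" ").
# i == 1: "[/INST]hello " is closed by sep2 ("</s>").
# i == 2: "[INST]bye " is closed by sep (" ") again, and so on, alternating via seps[i % 2].
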
@@ -324,19 +327,20 @@ def get_chat_format(name: str):
         )
 
 
+# see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/tokenization_llama.py
+# system prompt is "embedded" in the first message
 @register_chat_format("llama-2")
 def format_llama2(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    _system_template = "[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n"
-    _roles = dict(user="[INST]", assistant="[/INST]")
-    _sep = "\n\n"
-    system_message = _get_system_message(messages)
-    system_message = _system_template.format(system_message=system_message)
+    _system_template = "<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>"
+    _roles = dict(user="<s>[INST]", assistant="[/INST]")
     _messages = _map_roles(messages, _roles)
-    _messages.append((_roles["assistant"], None))
-    _prompt = _format_llama2(system_message, _messages, _sep)
+    system_message = _get_system_message(messages)
+    if system_message:
+        system_message = _system_template.format(system_message=system_message)
+    _prompt = _format_llama2(system_message, _messages, " ", "</s>") + "[/INST]"
     return ChatFormatterResponse(prompt=_prompt)
 
 
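
As a sanity check, here is a minimal sketch of what the updated "llama-2" formatter should now emit for a system-plus-user conversation. It is not part of the commit; the message contents are made up, and the expected prompt is traced by hand from the code above (the system prompt is folded into the first [INST] block, and the trailing [/INST] invites the assistant reply).

from llama_cpp.llama_chat_format import format_llama2

response = format_llama2(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello"},
    ]
)
# Expected prompt, roughly:
# "<s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>> Hello [/INST]"
print(response.prompt)
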

@@ -506,6 +510,26 @@ def format_chatml(
     _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
 
+# eg, export HF_MODEL=mistralai/Mistral-7B-Instruct-v0.1
+@register_chat_format("autotokenizer")
+def format_autotokenizer(
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    **kwargs: Any,
+) -> ChatFormatterResponse:
+    # https://huggingface.co/docs/transformers/main/chat_templating
+    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
+    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json
+    import os
+    from transformers import AutoTokenizer
+    huggingFaceModel = os.getenv("HF_MODEL")  # eg, mistralai/Mistral-7B-Instruct-v0.1
+    print(huggingFaceModel)
+    if not huggingFaceModel:
+        raise Exception("HF_MODEL needs to be set in env to use chat format 'autotokenizer'")
+    tokenizer = AutoTokenizer.from_pretrained(huggingFaceModel)
+    tokenizer.use_default_system_prompt = False
+    _prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+    # Return formatted prompt and eos token by default
+    return ChatFormatterResponse(prompt=_prompt, stop=tokenizer.eos_token)
 
 @register_chat_completion_handler("functionary")
 def functionary_chat_handler(
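
One possible way to exercise the new "autotokenizer" format end to end (a sketch, not part of this diff: it assumes transformers is installed, that the high-level Llama constructor accepts a chat_format argument as in recent llama-cpp-python releases, and a hypothetical local GGUF path):

import os

from llama_cpp import Llama

# The formatter reads HF_MODEL when it is invoked, so set it before the first chat call.
os.environ["HF_MODEL"] = "mistralai/Mistral-7B-Instruct-v0.1"

llm = Llama(
    model_path="./mistral-7b-instruct-v0.1.Q4_K_M.gguf",  # hypothetical path
    chat_format="autotokenizer",
)
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
)
print(out["choices"][0]["message"]["content"])

Because the formatter also returns tokenizer.eos_token as the stop sequence, generation should halt where the Hugging Face chat template expects it to.
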

‎llama_cpp/llama_cpp.py

+10 −43 (10 additions & 43 deletions)

@@ -827,7 +827,7 @@ def llama_kv_cache_clear(ctx: llama_context_p):
 # llama_pos p1);
 def llama_kv_cache_seq_rm(
     ctx: llama_context_p,
-    seq_id: llama_seq_id,
+    seq_id: Union[llama_seq_id, int],
     p0: Union[llama_pos, int],
     p1: Union[llama_pos, int],
 ):
@@ -855,8 +855,8 @@ def llama_kv_cache_seq_rm(
 # llama_pos p1);
 def llama_kv_cache_seq_cp(
     ctx: llama_context_p,
-    seq_id_src: llama_seq_id,
-    seq_id_dst: llama_seq_id,
+    seq_id_src: Union[llama_seq_id, int],
+    seq_id_dst: Union[llama_seq_id, int],
     p0: Union[llama_pos, int],
     p1: Union[llama_pos, int],
 ):
@@ -879,7 +879,7 @@ def llama_kv_cache_seq_cp(
 # llama_seq_id seq_id);
 def llama_kv_cache_seq_keep(
     ctx: llama_context_p,
-    seq_id: llama_seq_id,
+    seq_id: Union[llama_seq_id, int],
 ):
     return _lib.llama_kv_cache_seq_keep(ctx, seq_id)
 
@@ -900,7 +900,7 @@ def llama_kv_cache_seq_keep(
 # llama_pos delta);
 def llama_kv_cache_seq_shift(
     ctx: llama_context_p,
-    seq_id: llama_seq_id,
+    seq_id: Union[llama_seq_id, int],
     p0: Union[llama_pos, int],
     p1: Union[llama_pos, int],
     delta: Union[llama_pos, int],
@@ -1204,7 +1204,7 @@ def llama_get_embeddings(
 
 
 # LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
-def llama_token_get_text(model: llama_model_p, token: llama_token) -> bytes:
+def llama_token_get_text(model: llama_model_p, token: Union[llama_token, int]) -> bytes:
     return _lib.llama_token_get_text(model, token)
 
 
@@ -1213,7 +1213,7 @@ def llama_token_get_text(model: llama_model_p, token: llama_token) -> bytes:
 
 
 # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
-def llama_token_get_score(model: llama_model_p, token: llama_token) -> float:
+def llama_token_get_score(model: llama_model_p, token: Union[llama_token, int]) -> float:
     return _lib.llama_token_get_score(model, token)
 
 
@@ -1222,7 +1222,7 @@ def llama_token_get_score(model: llama_model_p, token: llama_token) -> float:
 
 
 # LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
-def llama_token_get_type(model: llama_model_p, token: llama_token) -> int:
+def llama_token_get_type(model: llama_model_p, token: Union[llama_token, int]) -> int:
     return _lib.llama_token_get_type(model, token)
 
 
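
These signature changes are annotation-only: ctypes already coerces a plain Python int to the underlying llama_token C integer type when argtypes are set, so the widened Union hints mainly let type checkers accept ordinary ints returned by other bindings. A minimal sketch, assuming a model handle has already been obtained via llama_load_model_from_file and that the token helpers take the model pointer in this revision:

from llama_cpp import llama_cpp

# model: llama_model_p, loaded earlier with llama_cpp.llama_load_model_from_file(...)
bos = llama_cpp.llama_token_bos(model)               # plain Python int
text = llama_cpp.llama_token_get_text(model, bos)    # no llama_token(...) wrapping needed
score = llama_cpp.llama_token_get_score(model, bos)
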

@@ -1302,39 +1302,6 @@ def llama_token_eot(model: llama_model_p) -> int:
 # //
 
 
-# // Convert the provided text into tokens.
-# // The tokens pointer must be large enough to hold the resulting tokens.
-# // Returns the number of tokens on success, no more than n_max_tokens
-# // Returns a negative number on failure - the number of tokens that would have been returned
-# LLAMA_API int llama_tokenize(
-#     const struct llama_model * model,
-#     const char * text,
-#     int text_len,
-#     llama_token * tokens,
-#     int n_max_tokens,
-#     bool add_bos);
-def llama_tokenize(
-    model: llama_model_p,
-    text: bytes,
-    text_len: Union[c_int, int],
-    tokens,  # type: Array[llama_token]
-    n_max_tokens: Union[c_int, int],
-    add_bos: Union[c_bool, bool],
-) -> int:
-    return _lib.llama_tokenize(model, text, text_len, tokens, n_max_tokens, add_bos)
-
-
-_lib.llama_tokenize.argtypes = [
-    llama_model_p,
-    c_char_p,
-    c_int,
-    llama_token_p,
-    c_int,
-    c_bool,
-]
-_lib.llama_tokenize.restype = c_int
-
-
 # /// @details Convert the provided text into tokens.
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
 # /// @return Returns the number of tokens on success, no more than n_max_tokens
@@ -1386,7 +1353,7 @@ def llama_tokenize(
 # int length);
 def llama_token_to_piece(
     model: llama_model_p,
-    token: llama_token,
+    token: Union[llama_token, int],
     buf: Union[c_char_p, bytes],
     length: Union[c_int, int],
 ) -> int:
@@ -1835,7 +1802,7 @@ def llama_sample_token(
 def llama_grammar_accept_token(
     ctx: llama_context_p,
     grammar: llama_grammar_p,
-    token: llama_token,
+    token: Union[llama_token, int],
 ) -> None:
     _lib.llama_grammar_accept_token(ctx, grammar, token)
 
‎vendor/llama.cpp
