Commit c69c630
1 parent 69c487f

convert_hf : fix Gemma v1 conversion (#8597)

* convert_hf : fix Gemma v1 conversion
* convert_hf : allow renaming tokens, but with a warning
* convert_hf : fix Gemma v1 not setting BOS and EOS tokens
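The second bullet is the core behaviour change: where the converter previously aborted with an assertion whenever an entry in added_tokens_decoder did not match the base SentencePiece vocabulary, it now overwrites the token and logs a warning. A minimal stand-alone sketch of that pattern, with made-up token ids and contents rather than the real converter state:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("convert_hf_to_gguf")


def apply_added_token(tokens: list[bytes], token_id: int, content: str) -> None:
    # same pattern as the diff below: warn instead of asserting, then rename the token
    new_token = content.encode("utf-8")
    if tokens[token_id] != new_token:
        # previously: assert tokens[token_id] == new_token
        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {content!r}')
    tokens[token_id] = new_token


# hypothetical base vocab whose entry 2 gets renamed via tokenizer_config.json
vocab = [b"<pad>", b"<unk>", b"<s>", b"</s>"]
apply_added_token(vocab, 2, "<bos>")   # warns: replacing token 2: '<s>' -> '<bos>'
assert vocab[2] == b"<bos>"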
File tree: 1 file changed (+12 -5 lines)

convert_hf_to_gguf.py: 12 additions & 5 deletions
@@ -753,7 +753,8 @@ def _create_vocab_sentencepiece(self):
                     token_id = int(token_id)
                     token: str = token_data["content"]
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token.encode("utf-8")
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                     if token_data.get("special") or self.does_token_look_special(token):
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                     else:
@@ -1312,6 +1313,7 @@ def set_vocab(self):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -2014,7 +2016,8 @@ def set_vocab(self):
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2030,7 +2033,8 @@ def set_vocab(self):
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2269,7 +2273,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2288,7 +2293,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2474,6 +2480,7 @@ def set_vocab(self):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

         self.gguf_writer.add_add_space_prefix(False)
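The two added special_vocab.chat_template = None lines address the "do not add it twice" comment: the model-specific set_vocab() builds a second gguf.SpecialVocab after the shared vocabulary path has presumably already written the chat template, and writing the same GGUF metadata key a second time is rejected. A minimal stand-alone sketch of that failure mode and the fix, using stand-in classes instead of the real gguf-py API (the duplicate-key behaviour is an assumption here):

from typing import Optional


class MiniGGUFWriter:
    # stand-in for a GGUF writer; assumes a duplicate metadata key is rejected
    def __init__(self) -> None:
        self.kv: dict = {}

    def add_chat_template(self, template: str) -> None:
        if "tokenizer.chat_template" in self.kv:
            raise ValueError("duplicated key: tokenizer.chat_template")
        self.kv["tokenizer.chat_template"] = template


class MiniSpecialVocab:
    # stand-in for gguf.SpecialVocab: only writes the template when it is set
    def __init__(self, chat_template: Optional[str]) -> None:
        self.chat_template = chat_template

    def add_to_gguf(self, writer: MiniGGUFWriter) -> None:
        if self.chat_template is not None:
            writer.add_chat_template(self.chat_template)


writer = MiniGGUFWriter()
template = "{{ bos_token }}{% for m in messages %}...{% endfor %}"

# shared vocab path writes the template once
MiniSpecialVocab(template).add_to_gguf(writer)

# model-specific set_vocab() builds another SpecialVocab; clearing chat_template
# mirrors `special_vocab.chat_template = None  # do not add it twice`
second = MiniSpecialVocab(template)
second.chat_template = None
second.add_to_gguf(writer)  # no duplicate-key error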

0 commit comments