Commit dcb8d44

cebtenzzre authored and hazelnutcloud committed

convert : automatically fall back to HfVocab if tokenizer.model doesn't exist (ggml-org#5821)

1 parent 5fd9d9e
File tree: 4 files changed, +43 −47 lines
README.md (+2 −2)

@@ -786,7 +786,7 @@ And after 4.45 hours, you will have the final perplexity.
 ### Interactive mode
 
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
 
 Here is an example of a few-shot interaction, invoked with the command

@@ -850,7 +850,7 @@ Sample run:
 ```
 == Running in interactive mode. ==
  - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
+ - Press Return to return control to LLaMA.
  - If you want to submit another line, end your input in '\'.
 
 Below is an instruction that describes a task. Write a response that appropriately completes the request.
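For reference, the reverse-prompt feature described in the README text above is typically exercised with an invocation along these lines (the model and prompt-file paths are illustrative, not part of this commit; `-m`, `--color`, `-i`, `-r`, and `-f` are existing main flags):

    ./main -m ./models/13B/ggml-model-q4_0.gguf --color -i -r "User:" -f prompts/chat-with-bob.txt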

convert-llama-ggml-to-gguf.py (+3 −3)

@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
         raise ValueError('Unable to load metadata')
     vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
     vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
     return params, vocab, special_vocab
 
@@ -398,8 +398,8 @@ def handle_args():
                         help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
     parser.add_argument("--vocab-dir", type=Path,
                         help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+    parser.add_argument("--vocabtype", default="spm,hfft",
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
     return parser.parse_args()
 
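Since `--vocabtype` is no longer restricted by `choices=` and instead carries a comma-separated priority list, the only parsing needed is a `split(",")` before the list reaches `VocabFactory.load_vocab`. A minimal sketch of that flag handling in isolation (the argument name and default are from the diff; the rest of the script is elided):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--vocabtype", default="spm,hfft",
                        help="comma-separated vocab types to try in order (default: spm,hfft)")
    args = parser.parse_args([])            # no flag given: use the default

    # "spm,hfft" -> ["spm", "hfft"]: try tokenizer.model first, then tokenizer.json
    vocab_types = args.vocabtype.split(",")
    print(vocab_types)                      # ['spm', 'hfft']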

convert.py (+36 −40)
@@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus:
 
 
 class VocabFactory:
+    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+
     def __init__(self, path: Path):
         self.path = path
-        self.files: dict[str, Path | None] = {
-            "tokenizer.model": None,
-            "vocab.json": None,
-            "tokenizer.json": None,
-        }
-        self._detect_files()
-
-    def _detect_files(self):
-        for file in self.files.keys():
-            file_path = self.path / file
-            parent_file_path = self.path.parent / file
-            if file_path.exists():
-                self.files[file] = file_path
-            elif parent_file_path.exists():
-                self.files[file] = parent_file_path
-        print(f"Found vocab files: {self.files}")
-
-    def _select_file(self, vocabtype: str | None) -> Path:
-        if vocabtype in ["spm", "bpe"]:
-            for file_key in self.files.keys():
-                if (file := self.files[file_key]) is not None:
-                    return file
-            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        if vocabtype == "hfft":
-            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
-            return self.path
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        self.file_paths = self._detect_files()
+        print(f"Found vocab files: {self.file_paths}")
+
+    def _detect_files(self) -> dict[str, Path | None]:
+        def locate(file: str) -> Path | None:
+            if (path := self.path / file).exists():
+                return path
+            if (path := self.path.parent / file).exists():
+                return path
+            return None
+
+        return {vt: locate(f) for vt, f in self._FILES.items()}
+
+    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
+        for vtype in vocab_types:
+            try:
+                path = self.file_paths[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+            if path is not None:
+                return vtype, path
+        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
     def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
         load_merges = vocabtype == "bpe"
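Taken on its own, the new lookup logic is a priority-ordered fallback: map each vocab type to its marker file once, then walk the caller's list and return the first type whose file actually exists. A self-contained sketch of the pattern (file names and messages mirror the diff; the temporary directory is only for demonstration):

    import tempfile
    from pathlib import Path

    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}

    def detect_files(model_dir: Path) -> dict[str, Path | None]:
        # Check the model directory first, then its parent, as the diff does.
        def locate(name: str) -> Path | None:
            for candidate in (model_dir / name, model_dir.parent / name):
                if candidate.exists():
                    return candidate
            return None
        return {vtype: locate(name) for vtype, name in _FILES.items()}

    def select_file(found: dict[str, Path | None], vocab_types: list[str]) -> tuple[str, Path]:
        for vtype in vocab_types:
            if vtype not in _FILES:
                raise ValueError(f"Unsupported vocabulary type {vtype}")
            if (path := found[vtype]) is not None:
                return vtype, path
        raise FileNotFoundError(f"Could not find any of {[_FILES[vt] for vt in vocab_types]}")

    # A directory holding only tokenizer.json now resolves to hfft instead of failing:
    with tempfile.TemporaryDirectory() as d:
        (Path(d) / "tokenizer.json").touch()
        print(select_file(detect_files(Path(d)), ["spm", "hfft"]))  # ('hfft', .../tokenizer.json)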
@@ -1322,30 +1319,30 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
-        path = self._select_file(vocabtype)
-        print(f"Loading vocab file '{path}', type '{vocabtype}'")
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab_type, path = self._select_file(vocab_types)
+        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
         vocab: Vocab
-        if vocabtype == "bpe":
+        if vocab_type == "bpe":
             vocab = BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocabtype == "spm":
+        elif vocab_type == "spm":
             vocab = SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocabtype == "hfft":
+        elif vocab_type == "hfft":
             vocab = HfVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
+                path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
         else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+            raise ValueError(vocab_type)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocabtype,
+            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
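One detail worth noting in the rewritten dispatch: for "hfft" the selected path is now the tokenizer.json file itself, so HfVocab is handed `path.parent`, since Hugging Face fast tokenizers load from the containing directory rather than from a single file. Assuming HfVocab wraps `transformers.AutoTokenizer` (which is how convert.py implements it around this commit), the directory-level load is roughly:

    from pathlib import Path
    from transformers import AutoTokenizer  # assumed dependency of the hfft path

    tokenizer_file = Path("models/llama-hf/tokenizer.json")  # hypothetical checkpoint
    # The file only marks the vocab type; the load itself takes the directory:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_file.parent)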
@@ -1379,15 +1376,14 @@ def main(args_in: list[str] | None = None) -> None:
     if np.uint32(1) == np.uint32(1).newbyteorder("<"):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
-    vocab_types = ["spm", "bpe", "hfft"]
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
     parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@@ -1448,7 +1444,7 @@ def main(args_in: list[str] | None = None) -> None:
     model_parent_path = model_plus.paths[0].parent
     vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
     vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
+    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
 
     if args.vocab_only:
         if not args.outfile:
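Net effect at the command line: a Hugging Face checkpoint that ships tokenizer.json but no tokenizer.model now converts without extra flags, because the default priority list falls back from spm to hfft. Illustrative invocations (model paths hypothetical; the flags are the ones defined above):

    # the new default is equivalent to:
    python convert.py models/llama-hf --vocab-type spm,hfft

    # tokenizer.model still wins whenever present; forcing a single type also still works:
    python convert.py models/llama-classic --vocab-type spm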

examples/infill/infill.cpp (+2 −2)

@@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
-            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                               " - To return control without starting a new line, end your input with '/'.\n";
         } else {
-            control_message = " - Press Return to return control to LLaMa.\n"
+            control_message = " - Press Return to return control to LLaMA.\n"
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }

0 commit comments
