@@ -6,6 +6,7 @@
 import ctypes
 import dataclasses
 import random
+import pathlib
 import string
 
 from contextlib import ExitStack
@@ -24,6 +25,7 @@
 
 import jinja2
 from jinja2.sandbox import ImmutableSandboxedEnvironment
+import filelock
 
 import numpy as np
 import numpy.typing as npt
@@ -279,11 +281,15 @@ def _convert_text_completion_logprobs_to_chat(
                     }
                     for top_token, top_logprob in top_logprobs.items()
                 ],
-            } for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"])
+            }
+            for (token, logprob, top_logprobs) in zip(
+                logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]
+            )
         ],
         "refusal": None,
     }
 
+
 def _convert_text_completion_to_chat(
     completion: llama_types.Completion,
 ) -> llama_types.ChatCompletion:
@@ -300,7 +306,9 @@ def _convert_text_completion_to_chat(
                     "role": "assistant",
                     "content": completion["choices"][0]["text"],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": completion["choices"][0]["finish_reason"],
             }
         ],
@@ -344,7 +352,9 @@ def _convert_text_completion_chunks_to_chat(
                     if chunk["choices"][0]["finish_reason"] is None
                     else {}
                 ),
-                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    chunk["choices"][0]["logprobs"]
+                ),
                 "finish_reason": chunk["choices"][0]["finish_reason"],
             }
         ],
@@ -407,7 +417,9 @@ def _convert_completion_to_chat_function(
                         }
                     ],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": "tool_calls",
             }
         ],
@@ -460,7 +472,9 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "delta": {
                         "role": None,
                         "content": None,
@@ -497,7 +511,9 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "delta": {
                         "role": None,
                         "content": None,
@@ -598,6 +614,19 @@ def chat_completion_handler(
             add_bos=not result.added_special,
             special=True,
         )
+
+        # Is there a way to ensure this is not set for production? This will
+        # slow things down at least a little (latency) because I/O is slow.
+        if llama.formatted_prompt_path is not None:
+            output_path = pathlib.Path(llama.formatted_prompt_path)
+
+            # We ensure that the output path ends with .ndjson in pydantic validation.
+            lockfile_path = output_path.with_suffix(".lock")
+            with filelock.FileLock(str(lockfile_path)):
+                with output_path.open("a", encoding="utf-8") as f:
+                    json.dump({"prompt": result.prompt, "prompt_tokens": prompt}, f)
+                    f.write("\n")
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
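Aside on the hunk above: each request appends one newline-delimited JSON object under the file lock, with "prompt" holding the rendered template text and "prompt_tokens" the token ids. A minimal reader-side sketch (the formatted_prompts.ndjson path is hypothetical; in practice it is whatever formatted_prompt_path was configured to):

import json
import pathlib

# Hypothetical path; stands in for the configured formatted_prompt_path.
log_path = pathlib.Path("formatted_prompts.ndjson")

with log_path.open("r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Keys mirror the json.dump() call in the hunk above.
        print(len(record["prompt_tokens"]), record["prompt"][:80])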
@@ -695,7 +724,7 @@ def chat_completion_handler(
 
 
 def hf_autotokenizer_to_chat_formatter(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> ChatFormatter:
     # https://huggingface.co/docs/transformers/main/chat_templating
     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
@@ -720,7 +749,7 @@ def format_autotokenizer(
 
 
 def hf_autotokenizer_to_chat_completion_handler(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> LlamaChatCompletionHandler:
     chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path)
     return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -1790,7 +1819,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
                         }
                     ],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": "tool_calls",
             }
         ],
@@ -2202,7 +2233,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": None,
                             "content": None,
@@ -2304,7 +2337,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": "assistant",
                             "content": None,
@@ -2342,7 +2377,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": "assistant",
                             "content": buffer.pop(0),
@@ -2365,7 +2402,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": "assistant",
                             "content": (
@@ -2451,7 +2490,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            chunk["choices"][0]["logprobs"]
+                        ),
                         "delta": {
                             "role": None,
                             "content": None,
@@ -2685,7 +2726,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                 choices=[
                     {
                         "index": 0,
-                        "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                            completion["choices"][0]["logprobs"]
+                        ),
                         "message": {
                             "role": "assistant",
                             "content": None if content == "" else content,
@@ -2795,9 +2838,7 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
         embed = self._llava_cpp.llava_image_embed_make_with_bytes(
             self.clip_ctx,
             n_threads_batch,
-            (ctypes.c_uint8 * len(image_bytes)).from_buffer(
-                bytearray(image_bytes)
-            ),
+            (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
             len(image_bytes),
         )
         self._last_image_embed = embed
@@ -2869,7 +2910,6 @@ def __call__(
         if self.verbose:
             print(text, file=sys.stderr)
 
-
         # Evaluate prompt
         llama.reset()
         llama._ctx.kv_cache_clear()
@@ -2885,7 +2925,9 @@ def __call__(
                 llama.eval(tokens)
             else:
                 image_bytes = self.load_image(value)
-                embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
+                embed = self._embed_image_bytes(
+                    image_bytes, llama.context_params.n_threads_batch
+                )
                 if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
                     raise ValueError(
                         f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
@@ -3404,7 +3446,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
         "{% endif %}"
         "{% endif %}"
         "{% endfor %}"
-
         "{% for content in message['content'] %}"
         "{% if content.type == 'text' %}"
         "{{ content.text }}"
@@ -3817,7 +3858,9 @@ def chatml_function_calling(
                 {
                     "finish_reason": "tool_calls",
                     "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                     "message": {
                         "role": "assistant",
                         "content": None,