@@ -2073,12 +2073,12 @@ def generate_streaming(tools, functions, function_call, prompt):
                 ],
             )
             # Yield tool_call/function_call stop message
-            yield {
-                "id": "chat" + chunk["id"],
-                "object": "chat.completion.chunk",
-                "created": chunk["created"],
-                "model": chunk["model"],
-                "choices": [
+            yield llama_types.CreateChatCompletionStreamResponse(
+                id="chat" + chunk["id"],
+                object="chat.completion.chunk",
+                created=chunk["created"],
+                model=chunk["model"],
+                choices=[
                     {
                         "index": 0,
                         "finish_reason": "tool_calls" if tools is not None else "function_call",
@@ -2088,7 +2088,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                         },
                     }
                 ],
-            }
+            )
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
             tool_index = 0
@@ -2108,20 +2108,20 @@ def generate_streaming(tools, functions, function_call, prompt):
             if function_name == "all":
                 prompt += "all\n<|content|>"
                 # Yield the first empty message for content
-                yield {
-                    "id": "chat" + chunk_id,
-                    "model": chunk["model"],
-                    "created": chunk_created,
-                    "object": "chat.completion.chunk",
-                    "choices": [
+                yield llama_types.CreateChatCompletionStreamResponse(
+                    id="chat" + chunk_id,
+                    model=chunk["model"],
+                    created=chunk_created,
+                    object="chat.completion.chunk",
+                    choices=[
                         {
                             "index": 0,
                             "delta": {"role": "assistant", "content": ""},
                             "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
-                }
+                )
             else:
                 prompt += f"{function_name}\n<|content|>"
                 grammar = get_grammar(function_name)
@@ -2221,20 +2221,20 @@ def generate_streaming(tools, functions, function_call, prompt):
                     prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
                 else:
                     # Yield stop message
-                    yield {
-                        "id": "chat" + chunk_id,
-                        "model": chunk["model"],
-                        "created": chunk_created,
-                        "object": "chat.completion.chunk",
-                        "choices": [
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        model=chunk["model"],
+                        created=chunk_created,
+                        object="chat.completion.chunk",
+                        choices=[
                             {
                                 "index": 0,
                                 "delta": {},
                                 "logprobs": None,
                                 "finish_reason": "stop",
                             }
                         ],
-                    }
+                    )
                     break
             else:
                 # Check whether the model wants to generate another turn
@@ -2284,25 +2284,22 @@ def generate_streaming(tools, functions, function_call, prompt):
                         tool_index += 1
                     else:
                         # Yield tool_call/function_call stop message
-                        yield {
-                            "id": "chat" + chunk_id,
-                            "object": "chat.completion.chunk",
-                            "created": chunk_created,
-                            "model": chunk["model"],
-                            "choices": [
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            object="chat.completion.chunk",
+                            created=chunk_created,
+                            model=chunk["model"],
+                            choices=[
                                 {
                                     "index": 0,
                                     "finish_reason": "tool_calls" if tools is not None else "function_call",
                                     "logprobs": None,
                                     "delta": {
-                                        "role": None,
-                                        "content": None,
-                                        "function_call": None,
-                                        "tool_calls": None,
+                                        "role": None, "content": None, "function_call": None, "tool_calls": None
                                     },
                                 }
                             ],
-                        }
+                        )
                         break

     if stream is not False:
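
Note on the change: the streamed payload should be identical before and after, assuming llama_types.CreateChatCompletionStreamResponse is a TypedDict (calling a TypedDict with keyword arguments builds an ordinary dict). The sketch below uses a hypothetical stand-in class, StreamChunkSketch, rather than the library's actual definition, to illustrate why the new constructor call and the old dict literal are interchangeable at runtime while giving static type checkers a schema for the chunk keys.

from typing import Any, Dict, List, TypedDict


class StreamChunkSketch(TypedDict):
    # Hypothetical stand-in for llama_types.CreateChatCompletionStreamResponse;
    # the fields mirror the keyword arguments used in the diff above.
    id: str
    object: str
    created: int
    model: str
    choices: List[Dict[str, Any]]


chunk = {"id": "abc123", "created": 1700000000, "model": "example-model"}

# Keyword construction of a TypedDict yields a plain dict at runtime,
# equivalent to the literal {"id": ..., "object": ..., ...} it replaces.
response = StreamChunkSketch(
    id="chat" + chunk["id"],
    object="chat.completion.chunk",
    created=chunk["created"],
    model=chunk["model"],
    choices=[
        {
            "index": 0,
            "delta": {"role": "assistant", "content": ""},
            "logprobs": None,
            "finish_reason": None,
        }
    ],
)
assert isinstance(response, dict)
assert response["object"] == "chat.completion.chunk"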