jeffrey-fong
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
Lines changed: 44 additions & 1 deletion (+44 -1)
@@ -2011,7 +2011,33 @@ def generate_streaming(tools, functions, function_call, prompt):
             tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
             completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
             completion_text = ""
+            first = True
             for chunk in completion:
+                # Yield the tool/function name first
+                if first:
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": 0,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_call["name"], "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
+                        ],
+                    )
+                    first = False
                 if tools is not None:
                     func_call_dict = {
                         "tool_calls": [
@@ -2046,6 +2072,23 @@ def generate_streaming(tools, functions, function_call, prompt):
                             }
                         ],
                     )
+            # Yield tool_call/function_call stop message
+            yield {
+                "id": "chat" + chunk["id"],
+                "object": "chat.completion.chunk",
+                "created": chunk["created"],
+                "model": chunk["model"],
+                "choices": [
+                    {
+                        "index": 0,
+                        "finish_reason": "tool_calls" if tools is not None else "function_call",
+                        "logprobs": None,
+                        "delta": {
+                            "role": None, "content": None, "function_call": None, "tool_calls": None
+                        },
+                    }
+                ],
+            }
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
             tool_index = 0
@@ -2240,7 +2283,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                         prompt += "\n<|from|>assistant\n<|recipient|>"
                         tool_index += 1
                     else:
-                        # Yield tool_call stop message
+                        # Yield tool_call/function_call stop message
                         yield {
                             "id": "chat" + chunk_id,
                             "object": "chat.completion.chunk",