Commit 1f1abfd

Update llama_types to match OpenAI v1 API

Parent: 7c3009e
4 files changed: +145 −53 lines

llama_cpp/llama.py

+5 −5 (5 additions, 5 deletions)
@@ -1231,7 +1231,7 @@ def create_embedding(
         else:
             inputs = input
 
-        data: List[EmbeddingData] = []
+        data: List[Embedding] = []
         total_tokens = 0
         for index, input in enumerate(inputs):
             tokens = self.tokenize(input.encode("utf-8"), special=True)
@@ -1297,7 +1297,7 @@ def _create_completion(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
+    ) -> Union[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]]:
         assert self._ctx is not None
         assert suffix is None or suffix.__class__ is str
 
@@ -1753,7 +1753,7 @@ def create_completion(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.
 
         Args:
@@ -1800,7 +1800,7 @@ def create_completion(
             grammar=grammar,
         )
         if stream:
-            chunks: Iterator[CompletionChunk] = completion_or_chunks
+            chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
             return chunks
         completion: Completion = next(completion_or_chunks)  # type: ignore
         return completion
@@ -1828,7 +1828,7 @@ def __call__(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.
 
         Args:
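
These renames only touch type annotations, so call sites keep working. A minimal usage sketch of the updated signatures, assuming a local GGUF model; the model path and prompt below are placeholders and not part of this commit:

from typing import Iterator

from llama_cpp import Llama
from llama_cpp.llama_types import CreateCompletionResponse, CreateCompletionStreamResponse

# Hypothetical model path; any local GGUF model works here.
llm = Llama(model_path="./models/7B/model.gguf")

# With stream=True the call yields CreateCompletionStreamResponse chunks;
# with stream=False it returns a single CreateCompletionResponse dict instead.
chunks: Iterator[CreateCompletionStreamResponse] = llm(
    "Q: Name the planets in the solar system. A:",
    max_tokens=32,
    stream=True,
)  # type: ignore

for chunk in chunks:
    print(chunk["choices"][0]["text"], end="", flush=True)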

llama_cpp/llama_chat_format.py

+39 −7 (39 additions, 7 deletions)
@@ -199,7 +199,7 @@ def _convert_text_completion_to_chat(
 
 
 def _convert_text_completion_chunks_to_chat(
-    chunks: Iterator[llama_types.CompletionChunk],
+    chunks: Iterator[llama_types.CreateCompletionStreamResponse],
 ) -> Iterator[llama_types.ChatCompletionChunk]:
     for i, chunk in enumerate(chunks):
         if i == 0:
@@ -239,12 +239,12 @@ def _convert_text_completion_chunks_to_chat(
 
 def _convert_completion_to_chat(
     completion_or_chunks: Union[
-        llama_types.Completion, Iterator[llama_types.CompletionChunk]
+        llama_types.CreateCompletionResponse, Iterator[llama_types.CreateCompletionStreamResponse]
     ],
     stream: bool = False,
-) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
+) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]]:
     if stream:
-        chunks: Iterator[llama_types.CompletionChunk] = completion_or_chunks  # type: ignore
+        chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
         return _convert_text_completion_chunks_to_chat(chunks)
     else:
         completion: llama_types.Completion = completion_or_chunks  # type: ignore
@@ -613,13 +613,13 @@ def prepare_messages_for_inference(
         all_messages: List[llama_types.ChatCompletionRequestMessage] = []
         if functions is not None:
             all_messages.append(
-                llama_types.ChatCompletionRequestMessage(
+                llama_types.ChatCompletionRequestSystemMessage(
                     role="system", content=generate_schema_from_functions(functions)
                 )
             )
 
         all_messages.append(
-            llama_types.ChatCompletionRequestMessage(
+            llama_types.ChatCompletionRequestSystemMessage(
                 role="system", content=SYSTEM_MESSAGE
             )
         )
@@ -636,7 +636,7 @@ def prepare_messages_for_inference(
             all_messages.append(message)
 
         all_messages.append(
-            llama_types.ChatCompletionRequestMessage(role="assistant", content=None)
+            llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content=None)
         )
 
     def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
@@ -734,3 +734,35 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
         ],
         usage=completion["usage"],
     )
+
+
+@register_chat_completion_handler("llava-1.5")
+def lava_1_5_chat_handler(
+    llama: llama.Llama,
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+    function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    stream: bool = False,
+    stop: Optional[Union[str, List[str]]] = [],
+    max_tokens: int = 256,
+    presence_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+    repeat_penalty: float = 1.1,
+    tfs_z: float = 1.0,
+    mirostat_mode: int = 0,
+    mirostat_tau: float = 5.0,
+    mirostat_eta: float = 0.1,
+    model: Optional[str] = None,
+    logits_processor: Optional[llama.LogitsProcessorList] = None,
+    grammar: Optional[llama.LlamaGrammar] = None,
+) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
+    # convert messages into a list of strings and images objects
+    # for each item in list
+    # if string, process it and append to prompt
+    # if image, evaluate it and add empty string to prompt (for now)
+    # generate completion
+    items = []
+    current_prompt = ""

llama_cpp/llama_types.py

+99 −39 (99 additions, 39 deletions)
@@ -1,5 +1,7 @@
 """Types and request signatures for OpenAI compatibility
 
+NOTE: These types may change to match the OpenAI OpenAPI specification.
+
 Based on the OpenAI OpenAPI specification:
 https://github.com/openai/openai-openapi/blob/master/openapi.yaml
 
@@ -19,9 +21,6 @@ class Embedding(TypedDict):
     embedding: List[float]
 
 
-EmbeddingData = Embedding
-
-
 class CreateEmbeddingResponse(TypedDict):
     object: Literal["list"]
     model: str
@@ -57,9 +56,6 @@ class CreateCompletionStreamResponse(TypedDict):
     choices: List[CompletionChoice]
 
 
-CompletionChunk = CreateCompletionStreamResponse
-
-
 class CreateCompletionResponse(TypedDict):
     id: str
     object: Literal["text_completion"]
@@ -69,9 +65,6 @@ class CreateCompletionResponse(TypedDict):
     usage: CompletionUsage
 
 
-Completion = CreateCompletionResponse
-
-
 class ChatCompletionFunctionCall(TypedDict):
     name: str
     arguments: str
@@ -100,73 +93,58 @@ class ChatCompletionResponseMessage(TypedDict):
     function_call: NotRequired[ChatCompletionFunctionCall]
 
 
-ChatCompletionMessage = ChatCompletionResponseMessage
-
-
 class ChatCompletionResponseFunction(TypedDict):
     name: str
     description: NotRequired[str]
     parameters: Dict[str, Any]  # TODO: make this more specific
 
 
-ChatCompletionFunction = ChatCompletionResponseFunction
-
-
 class ChatCompletionResponseChoice(TypedDict):
     index: int
-    message: ChatCompletionMessage
+    message: "ChatCompletionMessage"
     finish_reason: Optional[str]
 
 
-ChatCompletionChoice = ChatCompletionResponseChoice
-
-
 class CreateChatCompletionResponse(TypedDict):
     id: str
     object: Literal["chat.completion"]
     created: int
     model: str
-    choices: List[ChatCompletionChoice]
+    choices: List["ChatCompletionChoice"]
     usage: CompletionUsage
 
 
-ChatCompletion = CreateChatCompletionResponse
+class ChatCompletionMessageToolCallChunk(TypedDict):
+    index: int
+    id: NotRequired[str]
+    type: Literal["function"]
+    function: ChatCompletionFunctionCall
 
 
 class ChatCompletionStreamResponseDeltaEmpty(TypedDict):
     pass
 
 
-ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
-
-
 class ChatCompletionStreamResponseDelta(TypedDict):
-    role: NotRequired[Literal["assistant"]]
     content: NotRequired[str]
     function_call: NotRequired[ChatCompletionFunctionCall]
-
-
-ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
+    tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
+    role: NotRequired[Literal["system", "user", "assistant", "tool"]]
 
 
 class ChatCompletionStreamResponseChoice(TypedDict):
     index: int
-    delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
+    delta: Union["ChatCompletionChunkDelta", "ChatCompletionChunkDeltaEmpty"]
     finish_reason: Optional[Literal["stop", "length", "function_call"]]
 
 
-ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
-
-
 class ChatCompletionStreamResponse(TypedDict):
     id: str
     model: str
     object: Literal["chat.completion.chunk"]
     created: int
-    choices: List[ChatCompletionChunkChoice]
-
+    choices: List["ChatCompletionChunkChoice"]
 
-ChatCompletionChunk = ChatCompletionStreamResponse
 
 JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
 
@@ -181,8 +159,90 @@ class ChatCompletionFunctionCallOption(TypedDict):
     name: str
 
 
-class ChatCompletionRequestMessage(TypedDict):
-    role: Literal["assistant", "user", "system", "function"]
+class ChatCompletionRequestMessageContentPartText(TypedDict):
+    type: Literal["text"]
+    text: str
+
+
+class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict):
+    url: str
+    detail: NotRequired[Literal["auto", "low", "high"]]
+
+
+class ChatCompletionRequestMessageContentPartImage(TypedDict):
+    type: Literal["image_url"]
+    image_url: ChatCompletionRequestMessageContentPartImageImageUrl
+
+
+ChatCompletionRequestMessageContentPart = Union[
+    ChatCompletionRequestMessageContentPartText,
+    ChatCompletionRequestMessageContentPartImage,
+]
+
+
+class ChatCompletionRequestSystemMessage(TypedDict):
+    role: Literal["system"]
     content: Optional[str]
-    name: NotRequired[str]
-    function_call: NotRequired[ChatCompletionFunctionCall]
+
+
+class ChatCompletionRequestUserMessage(TypedDict):
+    role: Literal["user"]
+    content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]]
+
+
+class ChatCompletionMessageToolCallFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ChatCompletionMessageToolCall(TypedDict):
+    id: str
+    type: Literal["function"]
+    function: ChatCompletionMessageToolCallFunction
+
+
+ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall]
+
+
+class ChatCompletionRequestAssistantMessage(TypedDict):
+    role: Literal["assistant"]
+    content: Optional[str]
+    tool_calls: NotRequired[ChatCompletionMessageToolCalls]
+    function_call: NotRequired[ChatCompletionFunctionCall]  # DEPRECATED
+
+
+class ChatCompletionRequestToolMessage(TypedDict):
+    role: Literal["tool"]
+    content: Optional[str]
+    tool_call_id: str
+
+
+class ChatCompletionRequestFunctionMessage(TypedDict):
+    role: Literal["function"]
+    content: Optional[str]
+    name: str
+
+
+ChatCompletionRequestMessage = Union[
+    ChatCompletionRequestSystemMessage,
+    ChatCompletionRequestUserMessage,
+    ChatCompletionRequestAssistantMessage,
+    ChatCompletionRequestUserMessage,
+    ChatCompletionRequestToolMessage,
+    ChatCompletionRequestFunctionMessage,
+]
+
+# NOTE: The following type names are not part of the OpenAI OpenAPI specification
+# and will be removed in a future major release.
+
+EmbeddingData = Embedding
+CompletionChunk = CreateCompletionStreamResponse
+Completion = CreateCompletionResponse
+ChatCompletionMessage = ChatCompletionResponseMessage
+ChatCompletionChoice = ChatCompletionResponseChoice
+ChatCompletion = CreateChatCompletionResponse
+ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
+ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
+ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
+ChatCompletionChunk = ChatCompletionStreamResponse
+ChatCompletionFunction = ChatCompletionResponseFunction
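
The old names are kept at the bottom of the module as plain aliases, so existing imports continue to resolve while new code can move to the OpenAI-style names. A quick illustration of that equivalence (illustrative only, not part of the diff):

import llama_cpp.llama_types as llama_types

# The deprecated aliases point at the same objects as the new response types.
assert llama_types.Completion is llama_types.CreateCompletionResponse
assert llama_types.CompletionChunk is llama_types.CreateCompletionStreamResponse
assert llama_types.ChatCompletion is llama_types.CreateChatCompletionResponse

# ChatCompletionRequestMessage is now a Union of per-role TypedDicts, so a
# plain dict with the right keys still satisfies the annotation.
message: llama_types.ChatCompletionRequestMessage = {"role": "user", "content": "Hi"}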

llama_cpp/server/app.py

+2 −2 (2 additions, 2 deletions)
@@ -688,7 +688,7 @@ async def create_completion(
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
     iterator_or_completion: Union[
-        llama_cpp.Completion, Iterator[llama_cpp.CompletionChunk]
+        llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse]
     ] = await run_in_threadpool(llama, **kwargs)
 
     if isinstance(iterator_or_completion, Iterator):
@@ -697,7 +697,7 @@
 
         # If no exception was raised from first_response, we can assume that
         # the iterator is valid and we can use it to stream the response.
-        def iterator() -> Iterator[llama_cpp.CompletionChunk]:
+        def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
             yield first_response
             yield from iterator_or_completion
 