Commit e1cd61e

1 parent b1e9962 commit e1cd61e
2 files changed: +24 -7 lines changed

llama_cpp/llama.py (8 additions, 3 deletions)
@@ -872,7 +872,7 @@ def _completion_response(text: str, finish_reason: Literal["stop", "length"], lo
                 break

             if stream:
-                remaining_tokens = completion_tokens[returned_tokens:]
+                remaining_tokens = completion_tokens[returned_tokens:-1]
                 remaining_text = self.detokenize(remaining_tokens)
                 remaining_length = len(remaining_text)

@@ -1030,9 +1030,14 @@ def _completion_response(text: str, finish_reason: Literal["stop", "length"], lo
                         break
                     returned_tokens += 1
                     yield _completion_stream_response(
-                        text=last_text[: len(last_text) - (token_end_position - end)].decode("utf-8", errors="ignore"), logprobs_or_none=logprobs_or_none
+                        text=last_text[: len(last_text) - (token_end_position - end)].decode("utf-8", errors="ignore"), logprobs_or_none=logprobs_or_none, finish_reason=finish_reason
                     )
-                    break
+                    if self.cache:
+                        if self.verbose:
+                            print("Llama._create_completion: cache save", file=sys.stderr)
+                        self.cache[prompt_tokens + completion_tokens] = self.save_state()
+                        print("Llama._create_completion: cache saved", file=sys.stderr)
+                    return
                 returned_tokens += 1
                 yield _completion_stream_response(
                     text=self.detokenize([token]).decode("utf-8", errors="ignore"), logprobs_or_none=logprobs_or_none
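
The net effect of this hunk: the final streamed chunk now carries its finish_reason, and when a cache is configured the model state is saved under prompt_tokens + completion_tokens before the generator returns instead of breaking out silently. A rough caller-side sketch, not part of the commit, assuming the public llama-cpp-python API (Llama, LlamaCache, create_completion); the model path and prompt below are placeholders:

    # Sketch, not part of the commit: observing the new stream behaviour from the caller.
    from llama_cpp import Llama, LlamaCache

    llm = Llama(model_path="./model.bin", verbose=True)       # placeholder path
    llm.set_cache(LlamaCache())  # with this patch, state is saved when the stream finishes

    for chunk in llm.create_completion("Q: Name a planet. A:", max_tokens=16, stream=True):
        choice = chunk["choices"][0]
        print(choice["text"], end="", flush=True)
        if choice["finish_reason"] is not None:
            # the final chunk now reports why generation stopped ("stop" or "length")
            print(f"\n[finish_reason: {choice['finish_reason']}]")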

llama_cpp/llama_chat_format.py (16 additions, 4 deletions)
@@ -260,13 +260,25 @@ def _convert_text_completion_chunks_to_chat(
                     "index": 0,
                     "delta": {
                         "content": chunk["choices"][0]["text"],
-                    }
-                    if chunk["choices"][0]["finish_reason"] is None
-                    else {},
-                    "finish_reason": chunk["choices"][0]["finish_reason"],
+                    },
+                    "finish_reason": None,
                 }
             ],
         }
+        if chunk["choices"][0]["finish_reason"] is not None:
+            yield {
+                "id": "chat" + chunk["id"],
+                "model": chunk["model"],
+                "created": chunk["created"],
+                "object": "chat.completion.chunk",
+                "choices": [
+                    {
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": chunk["choices"][0]["finish_reason"],
+                    }
+                ],
+            }


 def _convert_completion_to_chat(
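
With this change the chat conversion emits content deltas that always report finish_reason=None, followed by one terminal chunk whose delta is empty and whose finish_reason is set, which mirrors the OpenAI streaming shape. A minimal consumer sketch, not part of the commit, assuming a configured Llama instance `llm` and a `messages` chat history (both placeholders) and the create_chat_completion streaming API:

    # Sketch, not part of the commit: how a caller sees the reworked chat chunk stream.
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        choice = chunk["choices"][0]
        if choice["finish_reason"] is None:
            # content deltas: finish_reason is always None here after this change
            print(choice["delta"].get("content", ""), end="", flush=True)
        else:
            # terminal chunk: empty delta, finish_reason set ("stop" or "length")
            print(f"\n[finish_reason: {choice['finish_reason']}]")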
