File tree Expand file tree Collapse file tree 2 files changed +115
-0
lines changed
Filter options
Expand file tree Collapse file tree 2 files changed +115
-0
lines changed
Original file line number Diff line number Diff line change
1
import llama_cpp
import llama_cpp.llama_tokenizer

import gradio as gr

# Download (or reuse from cache) the quantized Qwen chat model from the
# Hugging Face Hub.  The HF tokenizer is substituted for the one embedded
# in the GGUF file — presumably so tokenization matches the original
# Qwen tokenizer exactly; TODO confirm.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",  # glob pattern: selects the q8_0 quantization file
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False
)

# Model name passed through the OpenAI-compatible call below; the local
# llama.cpp backend serves the model loaded above regardless of this value.
model = "gpt-3.5-turbo"
14
+
15
def predict(message, history):
    """Stream a chat completion for *message*, replaying the prior turns.

    *history* is a sequence of (user, assistant) text pairs; yields the
    accumulated reply text after each streamed chunk so the UI updates live.
    """
    # Flatten the paired history into the OpenAI chat-message format.
    messages = [
        {"role": role, "content": content}
        for user_turn, assistant_turn in history
        for role, content in (("user", user_turn), ("assistant", assistant_turn))
    ]
    messages.append({"role": "user", "content": message})

    stream = llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        stream=True
    )

    # Accumulate deltas; yield the running text on every chunk (even empty
    # ones) so the frontend re-renders continuously.
    accumulated = ""
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:
            accumulated += delta
        yield accumulated
36
+
37
+
38
# Client-side snippet run by Gradio on page load: forces the dark theme by
# appending ?__theme=dark to the URL and reloading when it is not present.
js = """function () {
  gradioURL = window.location.href
  if (!gradioURL.endsWith('?__theme=dark')) {
    window.location.replace(gradioURL + '?__theme=dark');
  }
}"""

# Custom CSS: hide the Gradio footer and provide a full-height rule.
# NOTE(review): "full-height" matches an element/tag name, not a class
# (no leading dot) — confirm this selector is intended.
css = """
footer {
  visibility: hidden;
}
full-height {
  height: 100%;
}
"""
53
+
54
# Assemble the UI: soft theme, dark-mode JS shim, custom CSS, and a chat
# interface that stretches to fill the viewport height.
with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo:
    gr.ChatInterface(predict, fill_height=True, examples=["What is the capital of France?", "Who was the first person on the moon?"])


if __name__ == "__main__":
    # Start the local Gradio server (blocking call).
    demo.launch()
Original file line number Diff line number Diff line change
1
import gradio as gr

from openai import OpenAI

# Point the OpenAI SDK at a local llama.cpp server exposing the
# OpenAI-compatible REST API.  The api_key is a placeholder — the local
# server does not validate it, but the SDK requires a non-empty value.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="llama.cpp"
)

# Model name forwarded to the server; the llama.cpp server serves whatever
# model it loaded regardless of this value — TODO confirm server routing.
model = "gpt-3.5-turbo"
11
+
12
def predict(message, history):
    """Stream a chat reply for *message*, replaying (user, assistant) history.

    Yields the growing reply text after each streamed chunk so the chat UI
    renders tokens as they arrive.
    """
    conversation = []
    # Rebuild the alternating user/assistant transcript expected by the API.
    for prior_user, prior_assistant in history:
        conversation.extend((
            {"role": "user", "content": prior_user},
            {"role": "assistant", "content": prior_assistant},
        ))
    conversation.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model=model,
        messages=conversation,
        stream=True
    )

    # Fold each delta into the partial reply; yield on every chunk (even
    # empty deltas) to keep the frontend refreshing.
    partial = ""
    for chunk in stream:
        piece = chunk.choices[0].delta.content
        if piece:
            partial += piece
        yield partial
33
+
34
+
35
# Client-side snippet run by Gradio on page load: forces the dark theme by
# appending ?__theme=dark to the URL and reloading when it is not present.
js = """function () {
  gradioURL = window.location.href
  if (!gradioURL.endsWith('?__theme=dark')) {
    window.location.replace(gradioURL + '?__theme=dark');
  }
}"""

# Custom CSS: hide the Gradio footer and provide a full-height rule.
# NOTE(review): "full-height" matches an element/tag name, not a class
# (no leading dot) — confirm this selector is intended.
css = """
footer {
  visibility: hidden;
}
full-height {
  height: 100%;
}
"""
50
+
51
# Assemble the UI: soft theme, dark-mode JS shim, custom CSS, and a chat
# interface that stretches to fill the viewport height.
with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo:
    gr.ChatInterface(predict, fill_height=True, examples=["What is the capital of France?", "Who was the first person on the moon?"])


if __name__ == "__main__":
    # Start the local Gradio server (blocking call).
    demo.launch()
You can’t perform that action at this time.
0 commit comments