Commit 387d01d

fix(misc): Fix type errors

1 parent 4fb6fc1 commit 387d01d

1 file changed: +27 -16 lines

llama_cpp/server/app.py: 27 additions & 16 deletions
@@ -2,6 +2,7 @@
 
 import os
 import json
+import typing
 import contextlib
 
 from threading import Lock
@@ -88,11 +89,12 @@ def get_llama_proxy():
             llama_outer_lock.release()
 
 
-_ping_message_factory = None
+_ping_message_factory: typing.Optional[typing.Callable[[], bytes]] = None
 
-def set_ping_message_factory(factory):
-    global _ping_message_factory
-    _ping_message_factory = factory
+
+def set_ping_message_factory(factory: typing.Callable[[], bytes]):
+    global _ping_message_factory
+    _ping_message_factory = factory
 
 
 def create_app(
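For context, a minimal usage sketch of the newly typed hook: any zero-argument callable returning bytes satisfies the annotation. The function name sse_comment_ping and the ping payload below are illustrative only, not part of the commit; the import assumes the server extra of llama-cpp-python (at or after this commit) is installed.

# Hypothetical usage sketch, not the project's own code.
from llama_cpp.server.app import set_ping_message_factory


def sse_comment_ping() -> bytes:
    # SSE comment lines begin with ":" and are ignored by clients,
    # so they can serve as keep-alive pings.
    return b": keepalive\r\n\r\n"


set_ping_message_factory(sse_comment_ping)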
@@ -155,20 +157,19 @@ def create_app(
 
 async def get_event_publisher(
     request: Request,
-    inner_send_chan: MemoryObjectSendStream,
-    iterator: Iterator,
-    on_complete=None,
+    inner_send_chan: MemoryObjectSendStream[typing.Any],
+    iterator: Iterator[typing.Any],
+    on_complete: typing.Optional[typing.Callable[[], None]] = None,
 ):
+    server_settings = next(get_server_settings())
+    interrupt_requests = server_settings.interrupt_requests if server_settings else False
     async with inner_send_chan:
         try:
             async for chunk in iterate_in_threadpool(iterator):
                 await inner_send_chan.send(dict(data=json.dumps(chunk)))
                 if await request.is_disconnected():
                     raise anyio.get_cancelled_exc_class()()
-                if (
-                    next(get_server_settings()).interrupt_requests
-                    and llama_outer_lock.locked()
-                ):
+                if interrupt_requests and llama_outer_lock.locked():
                     await inner_send_chan.send(dict(data="[DONE]"))
                     raise anyio.get_cancelled_exc_class()()
             await inner_send_chan.send(dict(data="[DONE]"))
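Besides the annotations, the behavioral change in this hunk is that server settings are resolved once before the streaming loop, guarded against being None, instead of calling next(get_server_settings()) on every chunk. A rough standalone sketch of that hoisting pattern, with simplified, hypothetical names rather than the server's actual code:

from typing import Iterable, Iterator, Optional


class Settings:
    # Stand-in for the server settings object; only the flag we need.
    interrupt_requests: bool = True


def should_yield_to_pending_request() -> bool:
    # Stand-in for the server's llama_outer_lock.locked() check.
    return False


def stream_with_hoisted_settings(
    chunks: Iterable[dict],
    settings: Optional[Settings],
) -> Iterator[dict]:
    # Resolve the flag once, up front, rather than re-reading settings per chunk.
    interrupt_requests = settings.interrupt_requests if settings else False
    for chunk in chunks:
        if interrupt_requests and should_yield_to_pending_request():
            break  # stop streaming so a queued request can take over the model
        yield chunk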
@@ -268,6 +269,11 @@ async def create_completion(
     llama_proxy = await run_in_threadpool(
         lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)())
     )
+    if llama_proxy is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Service is not available",
+        )
     if isinstance(body.prompt, list):
         assert len(body.prompt) <= 1
         body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
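A self-contained sketch of the same guard pattern added here (and again in create_chat_completion below): when a shared resource has not been initialized, fail fast with HTTP 503 rather than letting a later attribute access on None surface as a 500. The resource name and route below are hypothetical, not the server's own.

from typing import Optional

from fastapi import FastAPI, HTTPException, status

app = FastAPI()

# Hypothetical stand-in for the llama proxy; None means "not loaded yet".
model_proxy: Optional[object] = None


@app.post("/v1/example")
async def example_endpoint() -> dict:
    if model_proxy is None:
        # Same shape as the check added in this commit: report 503 Service
        # Unavailable instead of crashing later with an AttributeError.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Service is not available",
        )
    return {"ok": True}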
@@ -409,7 +415,7 @@ async def create_chat_completion(
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": "Who won the world series in 2020"},
             ],
-            "response_format": { "type": "json_object" }
+            "response_format": {"type": "json_object"},
         },
     },
     "tool_calling": {
@@ -434,15 +440,15 @@ async def create_chat_completion(
                         },
                         "required": ["name", "age"],
                     },
-                }
+                },
             }
         ],
         "tool_choice": {
             "type": "function",
             "function": {
                 "name": "User",
-            }
-        }
+            },
+        },
     },
 },
 "logprobs": {
@@ -454,7 +460,7 @@ async def create_chat_completion(
             {"role": "user", "content": "What is the capital of France?"},
         ],
         "logprobs": True,
-        "top_logprobs": 10
+        "top_logprobs": 10,
     },
 },
}
@@ -468,6 +474,11 @@ async def create_chat_completion(
     llama_proxy = await run_in_threadpool(
         lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)())
     )
+    if llama_proxy is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Service is not available",
+        )
     exclude = {
         "n",
         "logit_bias_type",

0 commit comments