Commit afd3472

juanroesel committed:
Decouples MetricsExporter instance from Llama object

1 parent aab46da
3 files changed (+42 / -22 lines)
llama_cpp/_utils.py (+22 / -5)

@@ -6,6 +6,9 @@
 
 from typing import Any, Dict, List, Tuple, Union
 
+from llama_cpp.llama_metrics import QueueMetrics, MetricsExporter
+
+
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
 outnull_file = open(os.devnull, "w")
 errnull_file = open(os.devnull, "w")

@@ -147,7 +150,9 @@ def get_gpu_general_info() -> Tuple[float, float, float]:
     return 0.0, 0.0, 0.0
 
 
-async def monitor_task_queue(status_dict: Dict[str, Union[int, float]]):
+async def monitor_task_queue(
+    status_dict: Dict[str, Union[int, float]], metrics_exporter: MetricsExporter
+):
     """
     An asynchronous function that monitors the task queue and updates
     a shared status dictionary with the number of tasks that have not

@@ -160,6 +165,9 @@ async def monitor_task_queue(status_dict: Dict[str, Union[int, float]]):
     Any upcoming requests will be added to the task queue in the form of
     another RequestReponseCycle.run_asgi coroutine.
     """
+    if not isinstance(metrics_exporter, MetricsExporter):
+        raise ValueError("metrics_exporter must be an instance of MetricsExporter")
+
     all_tasks = asyncio.all_tasks()
 
     # Get count of all running tasks

@@ -168,14 +176,23 @@ async def monitor_task_queue(status_dict: Dict[str, Union[int, float]]):
     # Get basic metadata of all running tasks
     status_dict["running_tasks"] = {
         task.get_name(): str(task.get_coro())
-        .encode("ascii", errors="ignore")
-        .strip()
-        .decode("ascii")
+        .lstrip("<")
+        .rstrip(">")
         for task in all_tasks
     }
 
+    assert status_dict is not None
+
+    # Register current running tasks as a Prometheus metric
+    _labels = {
+        "service": "general",
+        "request_type": "health_check",
+    }
+    _queue_metrics = QueueMetrics(**status_dict)
+    metrics_exporter.log_queue_metrics(_queue_metrics, _labels)
+
     await asyncio.sleep(5)  # adds a delay of 5 seconds to avoid overloading the CPU
 
     asyncio.create_task(
-        monitor_task_queue(status_dict)
+        monitor_task_queue(status_dict, metrics_exporter)
     )  # pass status_dict to the next task
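
How the new signature is meant to be driven: the caller now owns the MetricsExporter and passes it into the monitor, instead of the monitor reaching into a Llama class attribute. A minimal usage sketch, assuming only the module layout shown in this commit (the status-dict name and the 30-second run window are illustrative, not from the codebase):

import asyncio

from llama_cpp._utils import monitor_task_queue
from llama_cpp.llama_metrics import MetricsExporter


async def main():
    status = {}                   # shared task-queue status dictionary
    exporter = MetricsExporter()  # created once by the caller, not by Llama
    # A single call reschedules itself via asyncio.create_task, so the
    # monitor keeps polling every 5 seconds while the event loop is alive.
    await monitor_task_queue(status, exporter)
    await asyncio.sleep(30)       # allow a few monitoring cycles to run


if __name__ == "__main__":
    asyncio.run(main())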

llama_cpp/llama.py (+2 / -6)

@@ -43,7 +43,7 @@
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
 
-from llama_cpp.llama_metrics import RequestMetrics, MetricsExporter
+from llama_cpp.llama_metrics import RequestMetrics
 
 from llama_cpp._utils import (
     get_cpu_usage,

@@ -74,7 +74,6 @@ class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
     __backend_initialized = False
-    __prometheus_metrics = MetricsExporter()
 
     def __init__(
         self,

@@ -488,10 +487,7 @@ def __init__(
            self.chat_format = "llama-2"
            if self.verbose:
                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
-
-        # Prometheus metrics
-        self.metrics = self.__prometheus_metrics
-
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         assert self._ctx.ctx is not None
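
Context for the removal above: a class-level attribute like __prometheus_metrics = MetricsExporter() is evaluated once when the class body is executed and is shared by every Llama instance, which couples the exporter's lifecycle to the class itself. A standalone illustration of that behavior, using stand-in names rather than the real classes:

class Exporter:
    """Stand-in for MetricsExporter."""


class Model:
    shared_exporter = Exporter()  # evaluated once, when the class body runs


a, b = Model(), Model()
assert a.shared_exporter is b.shared_exporter  # both instances see the same object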

llama_cpp/server/app.py (+18 / -11)

@@ -48,7 +48,7 @@
 )
 from llama_cpp.server.errors import RouteErrorHandler
 from llama_cpp._utils import monitor_task_queue
-from llama_cpp.llama_metrics import QueueMetrics
+from llama_cpp.llama_metrics import MetricsExporter
 
 
 router = APIRouter(route_class=RouteErrorHandler)

@@ -102,14 +102,26 @@ def set_ping_message_factory(factory):
     _ping_message_factory = factory
 
 
+def set_metrics_exporter():
+    global metrics_exporter
+    try:
+        metrics_exporter
+    except NameError:
+        metrics_exporter = MetricsExporter()
+
+    return metrics_exporter
+
 task_queue_status = {}
 
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """
     A context manager that launches tasks to be run during the application's lifespan.
     """
-    await monitor_task_queue(task_queue_status)
+    metrics_exporter = set_metrics_exporter()
+
+    await monitor_task_queue(task_queue_status, metrics_exporter)
     yield
 
 

@@ -514,7 +526,7 @@ async def create_chat_completion(
     # Adds the ai_service value from the request body to the kwargs
    # to be passed downstream to the llama_cpp.ChatCompletion object
     kwargs["ai_service"] = body.ai_service
-
+
     llama = llama_proxy(body.model)
     if body.logit_bias is not None:
         kwargs["logit_bias"] = (

@@ -523,14 +535,6 @@ async def create_chat_completion(
         else body.logit_bias
     )
 
-    # Register current running tasks as a Prometheus metric
-    _labels = {
-        "service": "general",
-        "request_type": "chat/completions",
-    }
-    _queue_metrics = QueueMetrics(**task_queue_status)
-    llama.metrics.log_queue_metrics(_queue_metrics, _labels)
-
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 

@@ -543,6 +547,9 @@ async def create_chat_completion(
     else:
         kwargs["logits_processor"].extend(_min_tokens_logits_processor)
 
+    # Set the metrics exporter for the llama object
+    llama.metrics = set_metrics_exporter()
+
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
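
set_metrics_exporter() relies on the try/except NameError idiom to create the module-level exporter lazily and hand back the same instance on every later call. A self-contained sketch of that idiom with generic names (not from the codebase):

class Exporter:
    def __init__(self):
        print("constructed once")


def get_exporter():
    global _exporter
    try:
        _exporter              # NameError until the first call binds the global
    except NameError:
        _exporter = Exporter()  # created lazily, exactly once per process
    return _exporter


first = get_exporter()
second = get_exporter()
assert first is second  # later calls return the cached instance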
