Commit afd3472

juanroesel committed:
Decouples MetricsExporter instance from Llama object

1 parent aab46da
3 files changed (+42 / -22 lines)
llama_cpp/_utils.py (+22 / -5)

@@ -6,6 +6,9 @@
 
 from typing import Any, Dict, List, Tuple, Union
 
+from llama_cpp.llama_metrics import QueueMetrics, MetricsExporter
+
+
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
 outnull_file = open(os.devnull, "w")
 errnull_file = open(os.devnull, "w")

@@ -147,7 +150,9 @@ def get_gpu_general_info() -> Tuple[float, float, float]:
     return 0.0, 0.0, 0.0
 
 
-async def monitor_task_queue(status_dict: Dict[str, Union[int, float]]):
+async def monitor_task_queue(
+    status_dict: Dict[str, Union[int, float]], metrics_exporter: MetricsExporter
+):
     """
     An asynchronous function that monitors the task queue and updates
     a shared status dictionary with the number of tasks that have not

@@ -160,6 +165,9 @@ async def monitor_task_queue(status_dict: Dict[str, Union[int, float]]):
     Any upcoming requests will be added to the task queue in the form of
     another RequestReponseCycle.run_asgi coroutine.
     """
+    if not isinstance(metrics_exporter, MetricsExporter):
+        raise ValueError("metrics_exporter must be an instance of MetricsExporter")
+
     all_tasks = asyncio.all_tasks()
 
     # Get count of all running tasks

@@ -168,14 +176,23 @@ async def monitor_task_queue(status_dict: Dict[str, Union[int, float]]):
     # Get basic metadata of all running tasks
     status_dict["running_tasks"] = {
         task.get_name(): str(task.get_coro())
-        .encode("ascii", errors="ignore")
-        .strip()
-        .decode("ascii")
+        .lstrip("<")
+        .rstrip(">")
         for task in all_tasks
     }
 
+    assert status_dict is not None
+
+    # Register current running tasks as a Prometheus metric
+    _labels = {
+        "service": "general",
+        "request_type": "health_check",
+    }
+    _queue_metrics = QueueMetrics(**status_dict)
+    metrics_exporter.log_queue_metrics(_queue_metrics, _labels)
+
     await asyncio.sleep(5)  # adds a delay of 5 seconds to avoid overloading the CPU
 
     asyncio.create_task(
-        monitor_task_queue(status_dict)
+        monitor_task_queue(status_dict, metrics_exporter)
     )  # pass status_dict to the next task
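
How the new signature is meant to be driven: the caller now owns the MetricsExporter and passes it into the monitor, instead of the monitor reaching into a Llama class attribute. A minimal usage sketch, assuming only the module layout shown in this commit (the status-dict name and the 30-second run window are illustrative, not from the codebase):

import asyncio

from llama_cpp._utils import monitor_task_queue
from llama_cpp.llama_metrics import MetricsExporter


async def main():
    status = {}                   # shared task-queue status dictionary
    exporter = MetricsExporter()  # created once by the caller, not by Llama
    # A single call reschedules itself via asyncio.create_task, so the
    # monitor keeps polling every 5 seconds while the event loop is alive.
    await monitor_task_queue(status, exporter)
    await asyncio.sleep(30)       # allow a few monitoring cycles to run


if __name__ == "__main__":
    asyncio.run(main())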

llama_cpp/llama.py (+2 / -6)

@@ -43,7 +43,7 @@
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
 
-from llama_cpp.llama_metrics import RequestMetrics, MetricsExporter
+from llama_cpp.llama_metrics import RequestMetrics
 
 from llama_cpp._utils import (
     get_cpu_usage,

@@ -74,7 +74,6 @@ class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
     __backend_initialized = False
-    __prometheus_metrics = MetricsExporter()
 
     def __init__(
         self,

@@ -488,10 +487,7 @@ def __init__(
            self.chat_format = "llama-2"
            if self.verbose:
                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
-
-        # Prometheus metrics
-        self.metrics = self.__prometheus_metrics
-
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         assert self._ctx.ctx is not None
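
Context for the removal above: a class-level attribute like __prometheus_metrics = MetricsExporter() is evaluated once when the class body is executed and is shared by every Llama instance, which couples the exporter's lifecycle to the class itself. A standalone illustration of that behavior, using stand-in names rather than the real classes:

class Exporter:
    """Stand-in for MetricsExporter."""


class Model:
    shared_exporter = Exporter()  # evaluated once, when the class body runs


a, b = Model(), Model()
assert a.shared_exporter is b.shared_exporter  # both instances see the same object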

llama_cpp/server/app.py (+18 / -11)

@@ -48,7 +48,7 @@
 )
 from llama_cpp.server.errors import RouteErrorHandler
 from llama_cpp._utils import monitor_task_queue
-from llama_cpp.llama_metrics import QueueMetrics
+from llama_cpp.llama_metrics import MetricsExporter
 
 
 router = APIRouter(route_class=RouteErrorHandler)

@@ -102,14 +102,26 @@ def set_ping_message_factory(factory):
     _ping_message_factory = factory
 
 
+def set_metrics_exporter():
+    global metrics_exporter
+    try:
+        metrics_exporter
+    except NameError:
+        metrics_exporter = MetricsExporter()
+
+    return metrics_exporter
+
 task_queue_status = {}
 
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """
     A context manager that launches tasks to be run during the application's lifespan.
     """
-    await monitor_task_queue(task_queue_status)
+    metrics_exporter = set_metrics_exporter()
+
+    await monitor_task_queue(task_queue_status, metrics_exporter)
     yield
 
 

@@ -514,7 +526,7 @@ async def create_chat_completion(
     # Adds the ai_service value from the request body to the kwargs
    # to be passed downstream to the llama_cpp.ChatCompletion object
     kwargs["ai_service"] = body.ai_service
-
+
     llama = llama_proxy(body.model)
     if body.logit_bias is not None:
         kwargs["logit_bias"] = (

@@ -523,14 +535,6 @@ async def create_chat_completion(
         else body.logit_bias
     )
 
-    # Register current running tasks as a Prometheus metric
-    _labels = {
-        "service": "general",
-        "request_type": "chat/completions",
-    }
-    _queue_metrics = QueueMetrics(**task_queue_status)
-    llama.metrics.log_queue_metrics(_queue_metrics, _labels)
-
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 

@@ -543,6 +547,9 @@ async def create_chat_completion(
     else:
         kwargs["logits_processor"].extend(_min_tokens_logits_processor)
 
+    # Set the metrics exporter for the llama object
+    llama.metrics = set_metrics_exporter()
+
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
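
set_metrics_exporter() relies on the try/except NameError idiom to create the module-level exporter lazily and hand back the same instance on every later call. A self-contained sketch of that idiom with generic names (not from the codebase):

class Exporter:
    def __init__(self):
        print("constructed once")


def get_exporter():
    global _exporter
    try:
        _exporter              # NameError until the first call binds the global
    except NameError:
        _exporter = Exporter()  # created lazily, exactly once per process
    return _exporter


first = get_exporter()
second = get_exporter()
assert first is second  # later calls return the cached instance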
