Commit 67ae74a

Author: juanroesel
Parent: bfefa26
Commit message: Fixed bug of llama metrics not collecting during streaming

4 files changed: +65 -4 lines

‎llama_cpp/llama.py

+57 lines changed: 57 additions & 0 deletions
@@ -1281,6 +1281,14 @@ def logit_bias_processor(
 
                 token_end_position = 0
                 for token in remaining_tokens:
+                    # Record TTFT metric (once)
+                    if idx == 0:
+                        _metrics_dict["time_to_first_token"] = time.time() - _ttft_start
+                    # Record TPOT metric
+                    else:
+                        _tpot_metrics.append(time.time() - _tpot_start)
+                        _tpot_start = time.time()  # reset
+
                     token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))
 
                     logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1374,6 +1382,53 @@ def logit_bias_processor(
                     print("Llama._create_completion: cache save", file=sys.stderr)
                 self.cache[prompt_tokens + completion_tokens] = self.save_state()
                 print("Llama._create_completion: cache saved", file=sys.stderr)
+
+            ## PROMETHEUS METRICS IN STREAMING MODE ##
+            # Record TTFT metric -- Setting to None if no tokens were generated
+            if not _metrics_dict.get("time_to_first_token"):
+                _metrics_dict["time_to_first_token"] = None
+
+            # Record TPOT metrics (per generated token)
+            _metrics_dict["time_per_output_token"] = _tpot_metrics
+
+            # Record metrics from the C++ backend (converted to seconds)
+            _timings = llama_cpp.llama_get_timings(self._ctx.ctx)
+            _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
+            _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
+            _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
+            _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
+            _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
+            _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
+            _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
+            _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)
+
+            # Record prefill and generation token metrics
+            _metrics_dict["prefill_tokens"] = len(prompt_tokens)
+            _metrics_dict["generation_tokens"] = len(completion_tokens)
+
+            # Record system info
+            _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
+            _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid)  # TODO: Returning always 0.0 -> check
+            _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
+            _metrics_dict["gpu_utilization"] = _gpu_utilization
+            _metrics_dict["gpu_ram_usage"] = _gpu_memory_used
+            _metrics_dict["gpu_ram_free"] = _gpu_memory_free
+            _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
+            _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
+            _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
+            _metrics_dict["system_info"] = {
+                "model": model_name,
+                "n_params": str(llama_cpp.llama_model_n_params(self.model)),
+                "n_embd": str(self.n_embd()),
+                "n_ctx": str(self.n_ctx()),
+                "n_vocab": str(self.n_vocab()),
+                "n_threads": str(self.n_threads)
+            }
+
+            # Log metrics to Prometheus
+            _all_metrics = Metrics(**_metrics_dict)
+            self.metrics.log_metrics(_all_metrics, labels=_labels)
+
             return
 
         if self.cache:
@@ -1448,6 +1503,8 @@ def logit_bias_processor(
                 "token_logprobs": token_logprobs,
                 "top_logprobs": top_logprobs,
             }
+
+        ## PROMETHEUS METRICS IN CHAT COMPLETION MODE ##
         # Record TTFT metric -- Setting to None if no tokens were generated
         if not _metrics_dict.get("time_to_first_token"):
             _metrics_dict["time_to_first_token"] = None

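The streaming fix above hinges on two timers kept alongside the token loop: _ttft_start, started before the first token is produced, and _tpot_start, reset after each subsequent token. As a standalone illustration of that pattern (the generator, function name, and final print are illustrative, not the library's API; this sketch also resets the per-token timer on every iteration):

    import time

    def stream_with_latency_metrics(token_iterator):
        # Illustrative sketch: measure time-to-first-token (TTFT) and
        # time-per-output-token (TPOT) around any token generator.
        metrics = {"time_to_first_token": None, "time_per_output_token": []}
        ttft_start = time.time()   # started before generation begins
        tpot_start = ttft_start
        for idx, token in enumerate(token_iterator):
            now = time.time()
            if idx == 0:
                metrics["time_to_first_token"] = now - ttft_start  # recorded once
            else:
                metrics["time_per_output_token"].append(now - tpot_start)
            tpot_start = now       # reset the per-token timer
            yield token
        # If the stream produced no tokens, TTFT stays None, mirroring the
        # streaming-mode block in the diff above.
        print(metrics)

    for tok in stream_with_latency_metrics(iter(["Hello", ",", " world"])):
        pass
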
‎llama_cpp/llama_chat_format.py

+2 lines changed: 2 additions & 0 deletions
@@ -1718,6 +1718,7 @@ def functionary_v1_v2_chat_handler(
     model: Optional[str] = None,
     logits_processor: Optional[llama.LogitsProcessorList] = None,
     grammar: Optional[llama.LlamaGrammar] = None,
+    ai_service: Optional[str] = None,
     **kwargs,  # type: ignore
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
@@ -1934,6 +1935,7 @@ def prepare_messages_for_inference(
         model=model,
         logits_processor=logits_processor,
         grammar=grammar,
+        ai_service=ai_service
     )
     if stream is False:
         completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()

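The two chat-format hunks thread the new ai_service keyword from the handler signature down to the completion call. A minimal sketch of why both edits are needed (the function names here are illustrative, not the library's API): a value that only arrives in a handler's **kwargs is silently dropped unless the handler declares it and forwards it explicitly.

    from typing import Optional

    def create_completion(prompt: str, ai_service: Optional[str] = None) -> dict:
        # Downstream code can use ai_service, e.g. as a metrics label.
        return {"text": prompt, "ai_service": ai_service}

    def chat_handler(prompt: str, ai_service: Optional[str] = None, **kwargs) -> dict:
        # Without the explicit parameter and forwarding, ai_service would sit
        # unused in **kwargs and never reach create_completion.
        return create_completion(prompt, ai_service=ai_service)

    print(chat_handler("hi", ai_service="chat-assistant", temperature=0.2))
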
‎llama_cpp/llama_metrics.py

+2 -2 lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ class MetricsExporter:
     def __init__(self):
         self.labels = LABELS
         # One-time metrics
-        self._histrogram_load_time = Histogram(
+        self._histogram_load_time = Histogram(
             name="llama_cpp_python:load_t_seconds",
             documentation="Histogram of load time in seconds",
             labelnames=self.labels,
@@ -194,7 +194,7 @@ def log_metrics(self, metrics: Metrics, labels: Dict[str, str]):
         """
         Log the metrics using the Prometheus client.
         """
-        self._histrogram_load_time.labels(**labels).observe(metrics.load_time)
+        self._histogram_load_time.labels(**labels).observe(metrics.load_time)
         self._histogram_sample_time.labels(**labels).observe(metrics.sample_time)
         if metrics.time_to_first_token:
             self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)

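The llama_metrics.py change is a spelling cleanup: the attribute _histrogram_load_time is renamed to _histogram_load_time both where the Histogram is created and where it is observed, so the two sites stay consistent. For context, a minimal prometheus_client sketch of the same create-then-observe pattern (the class, metric name, and label set here are illustrative, not the library's code):

    from prometheus_client import Histogram

    class MiniExporter:
        def __init__(self):
            self.labels = ("service",)  # illustrative label set
            self._histogram_load_time = Histogram(
                name="example_app:load_t_seconds",
                documentation="Histogram of load time in seconds",
                labelnames=self.labels,
            )

        def log_load_time(self, seconds: float, labels: dict) -> None:
            # The attribute read here must match the attribute created above,
            # which is why the rename in the diff touches both lines.
            self._histogram_load_time.labels(**labels).observe(seconds)

    exporter = MiniExporter()
    exporter.log_load_time(1.23, {"service": "llama-server"})
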
‎llama_cpp/server/app.py

+4 -2 lines changed: 4 additions & 2 deletions
@@ -455,8 +455,11 @@ async def create_chat_completion(
         "user",
     }
     kwargs = body.model_dump(exclude=exclude)
+
     # Adds the ai_service value from the request body to the kwargs
+    # to be passed downstream to the llama_cpp.ChatCompletion object
     kwargs["ai_service"] = body.ai_service
+
     llama = llama_proxy(body.model)
     if body.logit_bias is not None:
         kwargs["logit_bias"] = (
@@ -474,8 +477,7 @@ async def create_chat_completion(
 
     if isinstance(iterator_or_completion, Iterator):
         # EAFP: It's easier to ask for forgiveness than permission
-        # NOTE: Including kwargs so it can also pass the "ai_service" argument to the iterator
-        first_response = await run_in_threadpool(next, iterator_or_completion, **kwargs)
+        first_response = await run_in_threadpool(next, iterator_or_completion)
 
         # If no exception was raised from first_response, we can assume that
         # the iterator is valid and we can use it to stream the response.

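The server change reverts an earlier attempt to forward kwargs through run_in_threadpool when pulling the first chunk of a streamed chat completion. Because run_in_threadpool(next, it, **kwargs) ultimately calls next(it, **kwargs), and the built-in next() accepts no keyword arguments, that call raised TypeError and the stream failed before any Prometheus metrics could be recorded, which appears to be the bug named in the commit message. A plain-Python illustration of the failure mode (no server involved; the ai_service value is only an example):

    it = iter(["first chunk", "second chunk"])

    try:
        # Equivalent of the removed call: the extra kwargs end up passed to next().
        next(it, **{"ai_service": "chat-assistant"})
    except TypeError as exc:
        print(f"streaming would fail here: {exc}")

    # The fixed call is a plain next(); ai_service now travels via the explicit
    # keyword added in app.py and llama_chat_format.py instead.
    print(next(it))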