@@ -1281,6 +1281,14 @@ def logit_bias_processor(
1281
1281
1282
1282
token_end_position = 0
1283
1283
for token in remaining_tokens :
1284
+ # Record the time-to-first-token (TTFT) metric; only done when idx == 0
1285
+ if idx == 0 :
1286
+ _metrics_dict ["time_to_first_token" ] = time .time () - _ttft_start
1287
+ # Otherwise append a time-per-output-token (TPOT) sample measured since _tpot_start
1288
+ else :
1289
+ _tpot_metrics .append (time .time () - _tpot_start )
1290
+ _tpot_start = time .time () # reset
1291
+
1284
1292
token_end_position += len (self .detokenize ([token ], prev_tokens = prompt_tokens + completion_tokens [:returned_tokens ]))
1285
1293
1286
1294
logprobs_or_none : Optional [CompletionLogprobs ] = None
@@ -1374,6 +1382,53 @@ def logit_bias_processor(
1374
1382
print ("Llama._create_completion: cache save" , file = sys .stderr )
1375
1383
self .cache [prompt_tokens + completion_tokens ] = self .save_state ()
1376
1384
print ("Llama._create_completion: cache saved" , file = sys .stderr )
1385
+
1386
+ ## PROMETHEUS METRICS IN STREAMING MODE ##
1387
+ # Record TTFT metric -- Setting to None if no tokens were generated
1388
+ if not _metrics_dict .get ("time_to_first_token" ):
1389
+ _metrics_dict ["time_to_first_token" ] = None
1390
+
1391
+ # Record TPOT metrics (per generated token)
1392
+ _metrics_dict ["time_per_output_token" ] = _tpot_metrics
1393
+
1394
+ # Record metrics from the C++ backend (converted to seconds)
1395
+ _timings = llama_cpp .llama_get_timings (self ._ctx .ctx )
1396
+ _metrics_dict ["load_time" ] = round (_timings .t_load_ms / 1e3 , 2 )
1397
+ _metrics_dict ["sample_time" ] = round (_timings .t_sample_ms / 1e3 , 2 )
1398
+ _metrics_dict ["sample_throughput" ] = round (1e3 / _timings .t_sample_ms * _timings .n_sample , 2 ) if _timings .t_sample_ms > 0 else 0.0
1399
+ _metrics_dict ["prompt_eval_time" ] = round (_timings .t_p_eval_ms / 1e3 , 2 )
1400
+ _metrics_dict ["prompt_eval_throughput" ] = round (1e3 / _timings .t_p_eval_ms * _timings .n_p_eval , 2 ) if _timings .t_p_eval_ms > 0 else 0.0
1401
+ _metrics_dict ["completion_eval_time" ] = round (_timings .t_eval_ms / 1e3 , 2 )
1402
+ _metrics_dict ["completion_eval_throughput" ] = round (1e3 / _timings .t_eval_ms * _timings .n_eval , 2 ) if _timings .t_eval_ms > 0 else 0.0
1403
+ _metrics_dict ["end_to_end_latency" ] = round ((_timings .t_end_ms - _timings .t_start_ms ) / 1e3 , 2 )
1404
+
1405
+ # Record prefill and generation token metrics
1406
+ _metrics_dict ["prefill_tokens" ] = len (prompt_tokens )
1407
+ _metrics_dict ["generation_tokens" ] = len (completion_tokens )
1408
+
1409
+ # Record system info
1410
+ _gpu_utilization , _gpu_memory_used , _gpu_memory_free = get_gpu_general_info ()
1411
+ _metrics_dict ["cpu_utilization" ] = get_cpu_usage (_pid ) # TODO: this always returns 0.0 -- investigate why
1412
+ _metrics_dict ["cpu_ram_pid" ] = get_ram_usage (_pid )
1413
+ _metrics_dict ["gpu_utilization" ] = _gpu_utilization
1414
+ _metrics_dict ["gpu_ram_usage" ] = _gpu_memory_used
1415
+ _metrics_dict ["gpu_ram_free" ] = _gpu_memory_free
1416
+ _metrics_dict ["gpu_ram_pid" ] = get_gpu_info_by_pid (_pid )
1417
+ _metrics_dict ["state_size" ] = llama_cpp .llama_get_state_size (self ._ctx .ctx )
1418
+ _metrics_dict ["kv_cache_usage_ratio" ] = round (1. * llama_cpp .llama_get_kv_cache_used_cells (self ._ctx .ctx ) / self .n_ctx (), 2 )
1419
+ _metrics_dict ["system_info" ] = {
1420
+ "model" : model_name ,
1421
+ "n_params" : str (llama_cpp .llama_model_n_params (self .model )),
1422
+ "n_embd" : str (self .n_embd ()),
1423
+ "n_ctx" : str (self .n_ctx ()),
1424
+ "n_vocab" : str (self .n_vocab ()),
1425
+ "n_threads" : str (self .n_threads )
1426
+ }
1427
+
1428
+ # Log metrics to Prometheus
1429
+ _all_metrics = Metrics (** _metrics_dict )
1430
+ self .metrics .log_metrics (_all_metrics , labels = _labels )
1431
+
1377
1432
return
1378
1433
1379
1434
if self .cache :
@@ -1448,6 +1503,8 @@ def logit_bias_processor(
1448
1503
"token_logprobs" : token_logprobs ,
1449
1504
"top_logprobs" : top_logprobs ,
1450
1505
}
1506
+
1507
+ ## PROMETHEUS METRICS IN CHAT COMPLETION MODE ##
1451
1508
# Record TTFT metric -- Setting to None if no tokens were generated
1452
1509
if not _metrics_dict .get ("time_to_first_token" ):
1453
1510
_metrics_dict ["time_to_first_token" ] = None
0 commit comments