Commit 67ae74a

Author: juanroesel
Parent: bfefa26
Commit message: Fixed bug of llama metrics not collecting during streaming

4 files changed: +65 -4 lines

‎llama_cpp/llama.py

+57 lines changed: 57 additions & 0 deletions
@@ -1281,6 +1281,14 @@ def logit_bias_processor(
 
                 token_end_position = 0
                 for token in remaining_tokens:
+                    # Record TTFT metric (once)
+                    if idx == 0:
+                        _metrics_dict["time_to_first_token"] = time.time() - _ttft_start
+                    # Record TPOT metric
+                    else:
+                        _tpot_metrics.append(time.time() - _tpot_start)
+                        _tpot_start = time.time()  # reset
+
                     token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))
 
                     logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1374,6 +1382,53 @@ def logit_bias_processor(
                     print("Llama._create_completion: cache save", file=sys.stderr)
                 self.cache[prompt_tokens + completion_tokens] = self.save_state()
                 print("Llama._create_completion: cache saved", file=sys.stderr)
+
+            ## PROMETHEUS METRICS IN STREAMING MODE ##
+            # Record TTFT metric -- Setting to None if no tokens were generated
+            if not _metrics_dict.get("time_to_first_token"):
+                _metrics_dict["time_to_first_token"] = None
+
+            # Record TPOT metrics (per generated token)
+            _metrics_dict["time_per_output_token"] = _tpot_metrics
+
+            # Record metrics from the C++ backend (converted to seconds)
+            _timings = llama_cpp.llama_get_timings(self._ctx.ctx)
+            _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2)
+            _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2)
+            _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0
+            _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2)
+            _metrics_dict["prompt_eval_throughput"] = round(1e3 / _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0
+            _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2)
+            _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0
+            _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2)
+
+            # Record prefill and generation token metrics
+            _metrics_dict["prefill_tokens"] = len(prompt_tokens)
+            _metrics_dict["generation_tokens"] = len(completion_tokens)
+
+            # Record system info
+            _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info()
+            _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid)  # TODO: Returning always 0.0 -> check
+            _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid)
+            _metrics_dict["gpu_utilization"] = _gpu_utilization
+            _metrics_dict["gpu_ram_usage"] = _gpu_memory_used
+            _metrics_dict["gpu_ram_free"] = _gpu_memory_free
+            _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid)
+            _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx)
+            _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2)
+            _metrics_dict["system_info"] = {
+                "model": model_name,
+                "n_params": str(llama_cpp.llama_model_n_params(self.model)),
+                "n_embd": str(self.n_embd()),
+                "n_ctx": str(self.n_ctx()),
+                "n_vocab": str(self.n_vocab()),
+                "n_threads": str(self.n_threads)
+            }
+
+            # Log metrics to Prometheus
+            _all_metrics = Metrics(**_metrics_dict)
+            self.metrics.log_metrics(_all_metrics, labels=_labels)
+
             return
 
         if self.cache:
@@ -1448,6 +1503,8 @@ def logit_bias_processor(
                 "token_logprobs": token_logprobs,
                 "top_logprobs": top_logprobs,
             }
+
+        ## PROMETHEUS METRICS IN CHAT COMPLETION MODE ##
         # Record TTFT metric -- Setting to None if no tokens were generated
         if not _metrics_dict.get("time_to_first_token"):
             _metrics_dict["time_to_first_token"] = None

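The streaming fix above hinges on two timers kept alongside the token loop: _ttft_start, started before the first token is produced, and _tpot_start, reset after each subsequent token. As a standalone illustration of that pattern (the generator, function name, and final print are illustrative, not the library's API; this sketch also resets the per-token timer on every iteration):

    import time

    def stream_with_latency_metrics(token_iterator):
        # Illustrative sketch: measure time-to-first-token (TTFT) and
        # time-per-output-token (TPOT) around any token generator.
        metrics = {"time_to_first_token": None, "time_per_output_token": []}
        ttft_start = time.time()   # started before generation begins
        tpot_start = ttft_start
        for idx, token in enumerate(token_iterator):
            now = time.time()
            if idx == 0:
                metrics["time_to_first_token"] = now - ttft_start  # recorded once
            else:
                metrics["time_per_output_token"].append(now - tpot_start)
            tpot_start = now       # reset the per-token timer
            yield token
        # If the stream produced no tokens, TTFT stays None, mirroring the
        # streaming-mode block in the diff above.
        print(metrics)

    for tok in stream_with_latency_metrics(iter(["Hello", ",", " world"])):
        pass
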
‎llama_cpp/llama_chat_format.py

+2 lines changed: 2 additions & 0 deletions
@@ -1718,6 +1718,7 @@ def functionary_v1_v2_chat_handler(
     model: Optional[str] = None,
     logits_processor: Optional[llama.LogitsProcessorList] = None,
     grammar: Optional[llama.LlamaGrammar] = None,
+    ai_service: Optional[str] = None,
     **kwargs,  # type: ignore
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
@@ -1934,6 +1935,7 @@ def prepare_messages_for_inference(
         model=model,
         logits_processor=logits_processor,
         grammar=grammar,
+        ai_service=ai_service
     )
     if stream is False:
         completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()

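The two chat-format hunks thread the new ai_service keyword from the handler signature down to the completion call. A minimal sketch of why both edits are needed (the function names here are illustrative, not the library's API): a value that only arrives in a handler's **kwargs is silently dropped unless the handler declares it and forwards it explicitly.

    from typing import Optional

    def create_completion(prompt: str, ai_service: Optional[str] = None) -> dict:
        # Downstream code can use ai_service, e.g. as a metrics label.
        return {"text": prompt, "ai_service": ai_service}

    def chat_handler(prompt: str, ai_service: Optional[str] = None, **kwargs) -> dict:
        # Without the explicit parameter and forwarding, ai_service would sit
        # unused in **kwargs and never reach create_completion.
        return create_completion(prompt, ai_service=ai_service)

    print(chat_handler("hi", ai_service="chat-assistant", temperature=0.2))
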
‎llama_cpp/llama_metrics.py

+2 -2 lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ class MetricsExporter:
     def __init__(self):
         self.labels = LABELS
         # One-time metrics
-        self._histrogram_load_time = Histogram(
+        self._histogram_load_time = Histogram(
             name="llama_cpp_python:load_t_seconds",
             documentation="Histogram of load time in seconds",
             labelnames=self.labels,
@@ -194,7 +194,7 @@ def log_metrics(self, metrics: Metrics, labels: Dict[str, str]):
         """
         Log the metrics using the Prometheus client.
         """
-        self._histrogram_load_time.labels(**labels).observe(metrics.load_time)
+        self._histogram_load_time.labels(**labels).observe(metrics.load_time)
         self._histogram_sample_time.labels(**labels).observe(metrics.sample_time)
         if metrics.time_to_first_token:
             self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token)

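The llama_metrics.py change is a spelling cleanup: the attribute _histrogram_load_time is renamed to _histogram_load_time both where the Histogram is created and where it is observed, so the two sites stay consistent. For context, a minimal prometheus_client sketch of the same create-then-observe pattern (the class, metric name, and label set here are illustrative, not the library's code):

    from prometheus_client import Histogram

    class MiniExporter:
        def __init__(self):
            self.labels = ("service",)  # illustrative label set
            self._histogram_load_time = Histogram(
                name="example_app:load_t_seconds",
                documentation="Histogram of load time in seconds",
                labelnames=self.labels,
            )

        def log_load_time(self, seconds: float, labels: dict) -> None:
            # The attribute read here must match the attribute created above,
            # which is why the rename in the diff touches both lines.
            self._histogram_load_time.labels(**labels).observe(seconds)

    exporter = MiniExporter()
    exporter.log_load_time(1.23, {"service": "llama-server"})
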
‎llama_cpp/server/app.py

+4 -2 lines changed: 4 additions & 2 deletions
@@ -455,8 +455,11 @@ async def create_chat_completion(
         "user",
     }
     kwargs = body.model_dump(exclude=exclude)
+
     # Adds the ai_service value from the request body to the kwargs
+    # to be passed downstream to the llama_cpp.ChatCompletion object
     kwargs["ai_service"] = body.ai_service
+
     llama = llama_proxy(body.model)
     if body.logit_bias is not None:
         kwargs["logit_bias"] = (
@@ -474,8 +477,7 @@ async def create_chat_completion(
 
     if isinstance(iterator_or_completion, Iterator):
         # EAFP: It's easier to ask for forgiveness than permission
-        # NOTE: Including kwargs so it can also pass the "ai_service" argument to the iterator
-        first_response = await run_in_threadpool(next, iterator_or_completion, **kwargs)
+        first_response = await run_in_threadpool(next, iterator_or_completion)
 
         # If no exception was raised from first_response, we can assume that
         # the iterator is valid and we can use it to stream the response.

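The server change reverts an earlier attempt to forward kwargs through run_in_threadpool when pulling the first chunk of a streamed chat completion. Because run_in_threadpool(next, it, **kwargs) ultimately calls next(it, **kwargs), and the built-in next() accepts no keyword arguments, that call raised TypeError and the stream failed before any Prometheus metrics could be recorded, which appears to be the bug named in the commit message. A plain-Python illustration of the failure mode (no server involved; the ai_service value is only an example):

    it = iter(["first chunk", "second chunk"])

    try:
        # Equivalent of the removed call: the extra kwargs end up passed to next().
        next(it, **{"ai_service": "chat-assistant"})
    except TypeError as exc:
        print(f"streaming would fail here: {exc}")

    # The fixed call is a plain next(); ai_service now travels via the explicit
    # keyword added in app.py and llama_chat_format.py instead.
    print(next(it))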