@@ -153,7 +153,8 @@ def mock_kv_cache_seq_add(


 def test_llama_patch(mock_llama):
     n_ctx = 128
-    ai_service = "label-suggestions"
+    ai_service_completion = "test-label-suggestions"
+    ai_service_streaming = "test-acceptance-criteria"
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)

     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
@@ -165,32 +166,32 @@ def test_llama_patch(mock_llama):

     ## Test basic completion from bos until eos
     mock_llama(llama, all_text)
-    completion = llama.create_completion("", max_tokens=36, ai_service=ai_service)
+    completion = llama.create_completion("", max_tokens=36, ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == all_text
     assert completion["choices"][0]["finish_reason"] == "stop"

     ## Test basic completion until eos
     mock_llama(llama, all_text)
-    completion = llama.create_completion(text, max_tokens=20, ai_service=ai_service)
+    completion = llama.create_completion(text, max_tokens=20, ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == output_text
     assert completion["choices"][0]["finish_reason"] == "stop"

     ## Test streaming completion until eos
     mock_llama(llama, all_text)
-    chunks = list(llama.create_completion(text, max_tokens=20, stream=True, ai_service=ai_service))
+    chunks = list(llama.create_completion(text, max_tokens=20, stream=True, ai_service=ai_service_streaming))
     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text
     assert chunks[-1]["choices"][0]["finish_reason"] == "stop"

     ## Test basic completion until stop sequence
     mock_llama(llama, all_text)
-    completion = llama.create_completion(text, max_tokens=20, stop=["lazy"], ai_service=ai_service)
+    completion = llama.create_completion(text, max_tokens=20, stop=["lazy"], ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == " jumps over the "
     assert completion["choices"][0]["finish_reason"] == "stop"

     ## Test streaming completion until stop sequence
     mock_llama(llama, all_text)
     chunks = list(
-        llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"], ai_service=ai_service)
+        llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"], ai_service=ai_service_streaming)
     )
     assert (
         "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the "
@@ -199,13 +200,13 @@ def test_llama_patch(mock_llama):

     ## Test basic completion until length
     mock_llama(llama, all_text)
-    completion = llama.create_completion(text, max_tokens=2, ai_service=ai_service)
+    completion = llama.create_completion(text, max_tokens=2, ai_service=ai_service_completion)
     assert completion["choices"][0]["text"] == " jumps"
     assert completion["choices"][0]["finish_reason"] == "length"

     ## Test streaming completion until length
     mock_llama(llama, all_text)
-    chunks = list(llama.create_completion(text, max_tokens=2, stream=True, ai_service=ai_service))
+    chunks = list(llama.create_completion(text, max_tokens=2, stream=True, ai_service=ai_service_streaming))
     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps"
     assert chunks[-1]["choices"][0]["finish_reason"] == "length"

@@ -269,6 +270,22 @@ def test_llama_server():
     }


+def test_metrics_endpoint():
+    from fastapi.testclient import TestClient
+    from llama_cpp.server.app import create_app, Settings
+
+    settings = Settings(
+        model=MODEL,
+        vocab_only=True,
+    )
+    app = create_app(settings)
+    client = TestClient(app)
+    response = client.get("/metrics")
+    assert response.status_code == 200
+    assert "test-label-suggestions" in response.text
+    assert "test-acceptance-criteria" in response.text
+
+
 @pytest.mark.parametrize(
     "size_and_axis",
     [