Commit 308df3b

ggerganov authored and ngxson committed
llama : llama_perf + option to disable timings during decode (ggml-org#9355)
* llama : llama_perf + option to disable timings during decode

ggml-ci

* common : add llama_arg

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* perf : separate functions in the API

ggml-ci

* perf : safer pointer handling + naming update

ggml-ci

* minor : better local var name

* perf : abort on invalid sampler pointer

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
1 parent 72ed342 · commit 308df3b

File tree: 23 files changed, +134 −90 lines

‎common/arg.cpp

8 additions & 0 deletions

@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.prompt = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--no-perf"},
+        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](gpt_params & params) {
+            params.no_perf = true;
+            params.sparams.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(llama_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",

‎common/common.cpp

2 additions & 1 deletion

@@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(lctx);
     }

     iparams.model = model;
@@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
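For readers who do not go through the common helpers, a minimal standalone sketch (not part of this diff) of what the new field controls when a context is created directly with the public API; the model path and the decode loop are placeholders, and llama_load_model_from_file / llama_new_context_with_model are assumed to be the context-creation entry points of this era of the API:

// sketch.cpp - disable libllama's internal timing collection for a context
// (hypothetical example; "model.gguf" is a placeholder path)
#include "llama.h"

int main(void) {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = true; // what --no-perf / LLAMA_ARG_NO_PERF ultimately sets via the hunk above

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize, llama_decode(), sampling ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}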

‎common/common.h

2 additions & 0 deletions

@@ -124,6 +124,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics

     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch

‎common/sampling.cpp

3 additions & 3 deletions

@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;

     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance

     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }

‎examples/batched-bench/batched-bench.cpp

1 addition & 1 deletion

@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_batch_free(batch);

‎examples/batched.swift/Sources/main.swift

2 additions & 2 deletions

@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()

 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")

-llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
-llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count

‎examples/batched/batched.cpp

2 additions & 2 deletions

@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

‎examples/embedding/embedding.cpp

1 addition & 1 deletion

@@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     // clean up
     llama_batch_free(batch);

‎examples/eval-callback/eval-callback.cpp

1 addition & 1 deletion

@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_free(ctx);
     llama_free_model(model);

‎examples/imatrix/imatrix.cpp

1 addition & 1 deletion

@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_free(ctx);
     llama_free_model(model);

‎examples/llama-bench/llama-bench.cpp

1 addition & 1 deletion

@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
         fflush(p_err->fout);
     }

-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_free(ctx);

‎examples/llava/llava-cli.cpp

2 additions & 2 deletions

@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);

‎examples/llava/minicpmv-cli.cpp

1 addition & 1 deletion

@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx_llava->ctx_llama);

     ctx_llava->model = NULL;
     llava_free(ctx_llava);

‎examples/lookup/lookup.cpp

1 addition & 2 deletions

@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

     LOG_TEE("\ntarget:\n\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_perf_print(ctx, smpl);

     gpt_sampler_free(smpl);

‎examples/parallel/parallel.cpp

1 addition & 1 deletion

@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");

     // TODO: print sampling/grammar timings for all clients
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_batch_free(batch);

‎examples/passkey/passkey.cpp

1 addition & 1 deletion

@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

‎examples/perplexity/perplexity.cpp

1 addition & 1 deletion

@@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);
     write_logfile(ctx, params, model, results);

     llama_free(ctx);

‎examples/retrieval/retrieval.cpp

1 addition & 1 deletion

@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     // clean up
     llama_batch_free(query_batch);

‎examples/simple/simple.cpp

2 additions & 2 deletions

@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

‎examples/speculative/speculative.cpp

1 addition & 1 deletion

@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {

     LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx_dft);

     LOG_TEE("\ntarget:\n\n");
     gpt_perf_print(ctx_tgt, smpl);

‎include/llama.h

23 additions & 6 deletions

@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -1176,13 +1176,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //

-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT       = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
     };

-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);

     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
‎src/llama-sampling.cpp

Copy file name to clipboardExpand all lines: src/llama-sampling.cpp
+34Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1669,3 +1669,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
16691669

16701670
return LLAMA_DEFAULT_SEED;
16711671
}
1672+
1673+
// perf
1674+
1675+
struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
1676+
struct llama_perf_sampler_data data = {};
1677+
1678+
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
1679+
GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
1680+
}
1681+
1682+
const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
1683+
1684+
data.t_sample_ms = 1e-3 * ctx->t_sample_us;
1685+
data.n_sample = std::max(0, ctx->n_sample);
1686+
1687+
return data;
1688+
}
1689+
1690+
void llama_perf_sampler_print(const struct llama_sampler * chain) {
1691+
const auto data = llama_perf_sampler(chain);
1692+
1693+
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
1694+
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
1695+
}
1696+
1697+
void llama_perf_sampler_reset(struct llama_sampler * chain) {
1698+
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
1699+
GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
1700+
}
1701+
1702+
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
1703+
1704+
ctx->t_sample_us = ctx->n_sample = 0;
1705+
}
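A companion sketch for the sampler side (again not from the commit): per the guard added above, the pointer must be a chain created with llama_sampler_chain_init(), otherwise the GGML_ABORT path fires.

// sketch: sampler-chain timings; only valid for samplers built via llama_sampler_chain_init()
#include <stdio.h>
#include "llama.h"

static void report_sampler_perf(struct llama_sampler * chain) {
    const struct llama_perf_sampler_data data = llama_perf_sampler(chain);

    if (data.n_sample > 0) {
        printf("sampling: %.2f ms / %d runs (%.3f ms per token)\n",
               data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample);
    }

    llama_perf_sampler_reset(chain); // clears the chain's t_sample_us / n_sample counters
}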
