Commit edc17cf

Remove direct access to std streams from llama_main
The goal is to allow running llama_main while connected to other streams, such as TCP sockets.

Signed-off-by: Thiago Padilha <thiago@padilha.cc>
Parent: 1088d2d

3 files changed: 41 additions, 35 deletions
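The commit message names TCP sockets as the motivating use case. As a rough, illustrative sketch only (not part of this commit), a caller could accept a TCP connection, wrap the socket descriptor with fdopen(3), and hand the resulting FILE pointers to the new llama_main signature. The gpt_params type of the first argument, the helper name, and the port are assumptions based on the surrounding code, and error handling is omitted.

// Illustrative sketch, not part of this commit. Assumes POSIX sockets and
// that the caller has already loaded `params`, `vocab`, and `model` the way
// main.cpp does. The helper name, port, and `gpt_params` type are assumed.
#include <cstdint>
#include <cstdio>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#include "llama.h"

int serve_one_client(gpt_params params, gpt_vocab vocab, llama_model model,
                     int64_t t_load_us, int64_t t_main_start_us) {
    // Minimal blocking TCP server: listen on port 8080 and accept one client.
    int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    sockaddr_in addr{};
    addr.sin_family      = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port        = htons(8080);
    bind(listen_fd, (sockaddr *) &addr, sizeof(addr));
    listen(listen_fd, 1);
    int client_fd = accept(listen_fd, nullptr, nullptr);

    // Wrap the connection in stdio streams: one FILE* for reading the user's
    // input, one (on a dup'ed descriptor) for writing generated tokens, so
    // the two buffers stay independent.
    FILE *in  = fdopen(client_fd, "r");
    FILE *out = fdopen(dup(client_fd), "w");

    // Diagnostics keep going to the local terminal here; they could be
    // redirected to another stream in the same way.
    int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                         in, out, stderr);

    fclose(out);
    fclose(in);
    close(listen_fd);
    return ret;
}

Note that a socket stream is typically fully buffered by stdio (unlike a line-buffered terminal), so the fflush(outstream) calls retained by this diff are what push each batch of generated tokens out to the client promptly.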

llama.cpp

36 additions, 33 deletions

@@ -718,13 +718,16 @@ int llama_main(
         gpt_vocab vocab,
         llama_model model,
         int64_t t_load_us,
-        int64_t t_main_start_us) {
+        int64_t t_main_start_us,
+        FILE *instream,
+        FILE *outstream,
+        FILE *errstream) {

     if (params.seed < 0) {
         params.seed = time(NULL);
     }

-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(errstream, "%s: seed = %d\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -751,13 +754,13 @@ int llama_main(
     // tokenize the reverse prompt
     std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);

-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(errstream, "\n");
+    fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    fprintf(stderr, "\n");
+    fprintf(errstream, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -769,19 +772,19 @@ int llama_main(
         signal(SIGINT, sigint_handler);
 #endif

-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        fprintf(errstream, "%s: interactive mode on.\n", __func__);

         if(antiprompt_inp.size()) {
-            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(errstream, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
             }
-            fprintf(stderr, "\n");
+            fprintf(errstream, "\n");
         }
     }
-    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "\n\n");
+    fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(errstream, "\n\n");

     std::vector<gpt_vocab::id> embd;

@@ -795,7 +798,7 @@ int llama_main(


     if (params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
+        fprintf(errstream, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -814,7 +817,7 @@ int llama_main(

     // set the color for the prompt which will be output initially
     if (params.use_color) {
-        printf(ANSI_COLOR_YELLOW);
+        fprintf(outstream, ANSI_COLOR_YELLOW);
     }

     while (remaining_tokens > 0) {
@@ -823,7 +826,7 @@ int llama_main(
             const int64_t t_start_us = ggml_time_us();

             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                fprintf(stderr, "Failed to predict\n");
+                fprintf(errstream, "Failed to predict\n");
                 return 1;
             }

@@ -877,16 +880,16 @@ int llama_main(

             // reset color to default if we there is no pending user input
             if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
-                printf(ANSI_COLOR_RESET);
+                fprintf(outstream, ANSI_COLOR_RESET);
             }
         }

         // display text
         if (!input_noecho) {
             for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                fprintf(outstream, "%s", vocab.id_to_token[id].c_str());
             }
-            fflush(stdout);
+            fflush(outstream);
         }

         // in interactive mode, and not currently processing queued inputs;
@@ -901,16 +904,16 @@ int llama_main(
                 // currently being interactive
                 bool another_line=true;
                 while (another_line) {
-                    fflush(stdout);
+                    fflush(outstream);
                     char buf[256] = {0};
                     int n_read;
-                    if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
+                    if(params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN);
+                    if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) {
                         // presumable empty line, consume the newline
-                        std::ignore = scanf("%*c");
+                        std::ignore = fscanf(instream, "%*c");
                         n_read=0;
                     }
-                    if(params.use_color) printf(ANSI_COLOR_RESET);
+                    if(params.use_color) fprintf(outstream, ANSI_COLOR_RESET);

                     if (n_read > 0 && buf[n_read-1]=='\\') {
                         another_line = true;
@@ -936,7 +939,7 @@ int llama_main(

         // end of text token
         if (embd.back() == 2) {
-            fprintf(stderr, " [end of text]\n");
+            fprintf(errstream, " [end of text]\n");
             break;
         }
     }
@@ -949,18 +952,18 @@ int llama_main(
     {
         const int64_t t_main_end_us = ggml_time_us();

-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(errstream, "\n\n");
+        fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(errstream, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(errstream, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(errstream, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(errstream, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }

     ggml_free(model.ctx);

     if (params.use_color) {
-        printf(ANSI_COLOR_RESET);
+        fprintf(outstream, ANSI_COLOR_RESET);
     }

     return 0;

llama.h

4 additions, 1 deletion

@@ -64,5 +64,8 @@ int llama_main(
         gpt_vocab vocab,
         llama_model model,
         int64_t t_load_us,
-        int64_t t_main_start_us);
+        int64_t t_main_start_us,
+        FILE *instream,
+        FILE *outstream,
+        FILE *errstream);
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx);

main.cpp

1 addition, 1 deletion

@@ -56,5 +56,5 @@ int main(int argc, char ** argv) {
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }

-    return llama_main(params, vocab, model, t_main_start_us, t_load_us);
+    return llama_main(params, vocab, model, t_main_start_us, t_load_us, stdin, stdout, stderr);
 }
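
main.cpp simply forwards the process's standard streams, which preserves the old behaviour. As another illustrative, hypothetical variation (not in this commit), a caller could keep the terminal for interaction but divert the diagnostics that used to go to stderr into a log file; the wrapper name and "llama.log" below are invented for illustration.

// Hypothetical variation, not part of this commit: interactive I/O stays on
// the terminal while errstream is pointed at a log file. The wrapper name
// and "llama.log" are invented; the gpt_params type is assumed from main.cpp.
#include <cstdint>
#include <cstdio>

#include "llama.h"

int run_with_logfile(gpt_params params, gpt_vocab vocab, llama_model model,
                     int64_t t_load_us, int64_t t_main_start_us) {
    FILE *log = fopen("llama.log", "w");
    if (log == nullptr) {
        log = stderr;  // fall back to the previous behaviour
    }
    int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                         stdin, stdout, log);  // only errstream is redirected
    if (log != stderr) {
        fclose(log);
    }
    return ret;
}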
