From 51d003e88558e260fcfde492bded65d262338c95 Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 11:49:09 -0300 Subject: [PATCH 1/5] Move main.cpp to llama.cpp Signed-off-by: Thiago Padilha --- CMakeLists.txt | 2 +- Makefile | 4 ++-- main.cpp => llama.cpp | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename main.cpp => llama.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 38e7266dca630..803e9b273e800 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,7 +107,7 @@ endif() # endif() add_executable(llama - main.cpp + llama.cpp utils.cpp utils.h) diff --git a/Makefile b/Makefile index 1601079a48685..b03c7a083addf 100644 --- a/Makefile +++ b/Makefile @@ -191,8 +191,8 @@ utils.o: utils.cpp utils.h clean: rm -f *.o main quantize -main: main.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) +main: llama.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) llama.cpp ggml.o utils.o -o main $(LDFLAGS) ./main -h quantize: quantize.cpp ggml.o utils.o diff --git a/main.cpp b/llama.cpp similarity index 100% rename from main.cpp rename to llama.cpp From 82e70dbfe0f2a6d72a4b10b96708e1e0f447358b Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 11:52:55 -0300 Subject: [PATCH 2/5] Move struct definitions in llama.cpp to llama.h Signed-off-by: Thiago Padilha --- llama.cpp | 51 +---------------------------------------------- llama.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 50 deletions(-) create mode 100644 llama.h diff --git a/llama.cpp b/llama.cpp index c88405b82956a..2450e1c3d1f0b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,3 +1,4 @@ +#include "llama.h" #include "ggml.h" #include "utils.h" @@ -35,56 +36,6 @@ static const std::map LLAMA_N_PARTS = { { 8192, 8 }, }; -// default hparams (LLaMA 7B) -struct llama_hparams { - int32_t n_vocab = 32000; - int32_t n_ctx = 512; // this is provided as user input? - int32_t n_embd = 4096; - int32_t n_mult = 256; - int32_t n_head = 32; - int32_t n_layer = 32; - int32_t n_rot = 64; - int32_t f16 = 1; -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - -struct llama_model { - llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - // load the model's weights from a file bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); diff --git a/llama.h b/llama.h new file mode 100644 index 0000000000000..84f4db4081f8e --- /dev/null +++ b/llama.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include + +#include "ggml.h" + + +// default hparams (LLaMA 7B) +struct llama_hparams { + int32_t n_vocab = 32000; + int32_t n_ctx = 512; // this is provided as user input? 
+ int32_t n_embd = 4096; + int32_t n_mult = 256; + int32_t n_head = 32; + int32_t n_layer = 32; + int32_t n_rot = 64; + int32_t f16 = 1; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct llama_model { + llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; From e3648474d6220c25e7fd1bda92e35e4e20753c92 Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 11:58:11 -0300 Subject: [PATCH 3/5] Add main.cpp back, and invoke llama_main from it Signed-off-by: Thiago Padilha --- CMakeLists.txt | 3 ++- Makefile | 7 +++++-- llama.cpp | 2 +- llama.h | 2 ++ main.cpp | 5 +++++ 5 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 803e9b273e800..09ac18b1af816 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,8 +107,9 @@ endif() # endif() add_executable(llama - llama.cpp + main.cpp utils.cpp + llama.cpp utils.h) add_executable(quantize diff --git a/Makefile b/Makefile index b03c7a083addf..b7abaa9a029c5 100644 --- a/Makefile +++ b/Makefile @@ -188,11 +188,14 @@ ggml.o: ggml.c ggml.h utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o +llama.o: llama.cpp llama.h + $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o + clean: rm -f *.o main quantize -main: llama.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) llama.cpp ggml.o utils.o -o main $(LDFLAGS) +main: main.cpp ggml.o utils.o llama.o + $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o llama.o -o main $(LDFLAGS) ./main -h quantize: quantize.cpp ggml.o utils.o diff --git a/llama.cpp b/llama.cpp index 2450e1c3d1f0b..21369cea2788a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -733,7 +733,7 @@ const char * llama_print_system_info(void) { return s.c_str(); } -int main(int argc, char ** argv) { +int llama_main(int argc, char ** argv) { ggml_time_init(); const int64_t t_main_start_us = ggml_time_us(); diff --git a/llama.h b/llama.h index 84f4db4081f8e..ea71c7402302b 100644 --- a/llama.h +++ b/llama.h @@ -57,3 +57,5 @@ struct llama_model { struct ggml_context * ctx; std::map tensors; }; + +int llama_main(int argc, char ** argv); diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000000000..8b9a3ff50a66d --- /dev/null +++ b/main.cpp @@ -0,0 +1,5 @@ +#include "llama.h" + +int main(int argc, char ** argv) { + return llama_main(argc, argv); +} From 1088d2dd04761968073afe17ba4824e6c1c94703 Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 12:12:00 -0300 Subject: [PATCH 4/5] Move model loading back to main.cpp Signed-off-by: Thiago Padilha --- llama.cpp | 59 ++++++------------------------------------------------- llama.h | 9 ++++++++- main.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 55 deletions(-) diff --git a/llama.cpp b/llama.cpp index 21369cea2788a..35ec3e1401dce 100644 --- a/llama.cpp +++ b/llama.cpp @@ -713,36 +713,12 @@ void sigint_handler(int signo) { } #endif -const char * llama_print_system_info(void) { - static std::string s; - 
- s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - - return s.c_str(); -} - -int llama_main(int argc, char ** argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/llama-7B/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } +int llama_main( + gpt_params params, + gpt_vocab vocab, + llama_model model, + int64_t t_load_us, + int64_t t_main_start_us) { if (params.seed < 0) { params.seed = time(NULL); @@ -758,29 +734,6 @@ int llama_main(int argc, char ** argv) { // params.prompt = R"(// this function checks if the number n is prime //bool is_prime(int n) {)"; - int64_t t_load_us = 0; - - gpt_vocab vocab; - llama_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - int n_past = 0; int64_t t_sample_us = 0; diff --git a/llama.h b/llama.h index ea71c7402302b..9cacb613c71f8 100644 --- a/llama.h +++ b/llama.h @@ -6,6 +6,7 @@ #include #include "ggml.h" +#include "utils.h" // default hparams (LLaMA 7B) @@ -58,4 +59,10 @@ struct llama_model { std::map tensors; }; -int llama_main(int argc, char ** argv); +int llama_main( + gpt_params params, + gpt_vocab vocab, + llama_model model, + int64_t t_load_us, + int64_t t_main_start_us); +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx); diff --git a/main.cpp b/main.cpp index 8b9a3ff50a66d..7106a8e1978e1 100644 --- a/main.cpp +++ b/main.cpp @@ -1,5 +1,60 @@ +#include "ggml.h" +#include "utils.h" #include "llama.h" +const char * llama_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + 
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + + return s.c_str(); +} + int main(int argc, char ** argv) { - return llama_main(argc, argv); + ggml_time_init(); + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + llama_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + return llama_main(params, vocab, model, t_main_start_us, t_load_us); } From edc17cfa9fe96fb6e0f884ef763044af67abeb8c Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 12:20:20 -0300 Subject: [PATCH 5/5] Remove direct access to std streams from llama_main The goal is to allow running llama_main while connected to other streams, such as TCP sockets. Signed-off-by: Thiago Padilha --- llama.cpp | 69 +++++++++++++++++++++++++++++-------------------------- llama.h | 5 +++- main.cpp | 2 +- 3 files changed, 41 insertions(+), 35 deletions(-) diff --git a/llama.cpp b/llama.cpp index 35ec3e1401dce..05e37a0d6e044 100644 --- a/llama.cpp +++ b/llama.cpp @@ -718,13 +718,16 @@ int llama_main( gpt_vocab vocab, llama_model model, int64_t t_load_us, - int64_t t_main_start_us) { + int64_t t_main_start_us, + FILE *instream, + FILE *outstream, + FILE *errstream) { if (params.seed < 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(errstream, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.prompt.empty()) { @@ -751,13 +754,13 @@ int llama_main( // tokenize the reverse prompt std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + fprintf(errstream, "\n"); + fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); } - fprintf(stderr, "\n"); + fprintf(errstream, "\n"); if (params.interactive) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; @@ -769,19 +772,19 @@ int llama_main( signal(SIGINT, sigint_handler); #endif - fprintf(stderr, "%s: interactive mode on.\n", __func__); + fprintf(errstream, "%s: interactive mode on.\n", __func__); if(antiprompt_inp.size()) { - fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); - fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); + fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); + fprintf(errstream, "%s: number of 
tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); for (int i = 0; i < (int) antiprompt_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); + fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); } - fprintf(stderr, "\n"); + fprintf(errstream, "\n"); } } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); - fprintf(stderr, "\n\n"); + fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(errstream, "\n\n"); std::vector embd; @@ -795,7 +798,7 @@ int llama_main( if (params.interactive) { - fprintf(stderr, "== Running in interactive mode. ==\n" + fprintf(errstream, "== Running in interactive mode. ==\n" #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) " - Press Ctrl+C to interject at any time.\n" #endif @@ -814,7 +817,7 @@ int llama_main( // set the color for the prompt which will be output initially if (params.use_color) { - printf(ANSI_COLOR_YELLOW); + fprintf(outstream, ANSI_COLOR_YELLOW); } while (remaining_tokens > 0) { @@ -823,7 +826,7 @@ int llama_main( const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - fprintf(stderr, "Failed to predict\n"); + fprintf(errstream, "Failed to predict\n"); return 1; } @@ -877,16 +880,16 @@ int llama_main( // reset color to default if we there is no pending user input if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { - printf(ANSI_COLOR_RESET); + fprintf(outstream, ANSI_COLOR_RESET); } } // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); + fprintf(outstream, "%s", vocab.id_to_token[id].c_str()); } - fflush(stdout); + fflush(outstream); } // in interactive mode, and not currently processing queued inputs; @@ -901,16 +904,16 @@ int llama_main( // currently being interactive bool another_line=true; while (another_line) { - fflush(stdout); + fflush(outstream); char buf[256] = {0}; int n_read; - if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); - if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { + if(params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN); + if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) { // presumable empty line, consume the newline - std::ignore = scanf("%*c"); + std::ignore = fscanf(instream, "%*c"); n_read=0; } - if(params.use_color) printf(ANSI_COLOR_RESET); + if(params.use_color) fprintf(outstream, ANSI_COLOR_RESET); if (n_read > 0 && buf[n_read-1]=='\\') { another_line = true; @@ -936,7 +939,7 @@ int llama_main( // end of text token if (embd.back() == 2) { - fprintf(stderr, " [end of text]\n"); + fprintf(errstream, " [end of text]\n"); break; } } @@ -949,18 +952,18 @@ int llama_main( { const int64_t t_main_end_us = ggml_time_us(); - fprintf(stderr, "\n\n"); - fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", 
__func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+ fprintf(errstream, "\n\n");
+ fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+ fprintf(errstream, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+ fprintf(errstream, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+ fprintf(errstream, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+ fprintf(errstream, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
 }
 
 ggml_free(model.ctx);
 
 if (params.use_color) {
- printf(ANSI_COLOR_RESET);
+ fprintf(outstream, ANSI_COLOR_RESET);
 }
 
 return 0;
diff --git a/llama.h b/llama.h
index 9cacb613c71f8..7c8409d1a158e 100644
--- a/llama.h
+++ b/llama.h
@@ -64,5 +64,8 @@ int llama_main(
 gpt_vocab vocab,
 llama_model model,
 int64_t t_load_us,
- int64_t t_main_start_us);
+ int64_t t_main_start_us,
+ FILE *instream,
+ FILE *outstream,
+ FILE *errstream);
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx);
diff --git a/main.cpp b/main.cpp
index 7106a8e1978e1..e3fc73e750a21 100644
--- a/main.cpp
+++ b/main.cpp
@@ -56,5 +56,5 @@ int main(int argc, char ** argv) {
 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
 }
 
- return llama_main(params, vocab, model, t_main_start_us, t_load_us);
+ return llama_main(params, vocab, model, t_load_us, t_main_start_us, stdin, stdout, stderr);
 }
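
A note on the FILE * parameters introduced in PATCH 5/5: the commit message gives TCP sockets as the motivating example, so a minimal sketch of such a caller follows. It is illustrative only and not part of the series; the POSIX socket setup, the fixed port 8080, the single accepted client and the minimal error handling are all assumptions made for the example, while gpt_params_parse(), llama_model_load() and llama_main() are used with the signatures the patches themselves declare in utils.h and llama.h.

    // tcp_main.cpp (hypothetical example, not added by any patch above)
    #include <cstdint>
    #include <cstdio>

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #include "ggml.h"
    #include "utils.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        ggml_time_init();
        const int64_t t_main_start_us = ggml_time_us();

        gpt_params params;
        params.model = "models/llama-7B/ggml-model.bin";
        if (gpt_params_parse(argc, argv, params) == false) {
            return 1;
        }

        // load the model once, exactly as main.cpp does
        gpt_vocab vocab;
        llama_model model;
        const int64_t t_start_us = ggml_time_us();
        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
            fprintf(stderr, "failed to load model from '%s'\n", params.model.c_str());
            return 1;
        }
        const int64_t t_load_us = ggml_time_us() - t_start_us;

        // listen on an arbitrary port and accept a single client
        // (assumption: one connection, no retry or cleanup logic)
        int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
        sockaddr_in addr = {};
        addr.sin_family      = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port        = htons(8080);
        if (listen_fd < 0 ||
            bind(listen_fd, (sockaddr *) &addr, sizeof(addr)) != 0 ||
            listen(listen_fd, 1) != 0) {
            fprintf(stderr, "failed to listen on port 8080\n");
            return 1;
        }
        int client_fd = accept(listen_fd, NULL, NULL);
        if (client_fd < 0) {
            fprintf(stderr, "accept failed\n");
            return 1;
        }

        // wrap the socket in stdio streams; dup() so the read and write FILE
        // objects own separate descriptors and can be closed independently
        FILE * instream  = fdopen(client_fd, "r");
        FILE * outstream = fdopen(dup(client_fd), "w");

        // diagnostics stay on the server's stderr; prompt input and generated
        // text flow over the socket (arguments follow the order declared in
        // llama.h: t_load_us before t_main_start_us)
        int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                             instream, outstream, stderr);

        fclose(outstream);
        fclose(instream);
        close(listen_fd);
        return ret;
    }

Because llama_main only needs FILE * streams, anything that can be exposed as a file descriptor (a pipe, a pseudo-terminal, a socket) can be wrapped with fdopen() and plugged in the same way, without further changes to llama.cpp.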