From 51d003e88558e260fcfde492bded65d262338c95 Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 11:49:09 -0300 Subject: [PATCH 1/5] Move main.cpp to llama.cpp Signed-off-by: Thiago Padilha --- CMakeLists.txt | 2 +- Makefile | 4 ++-- main.cpp => llama.cpp | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename main.cpp => llama.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 38e7266dca630..803e9b273e800 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,7 +107,7 @@ endif() # endif() add_executable(llama - main.cpp + llama.cpp utils.cpp utils.h) diff --git a/Makefile b/Makefile index 1601079a48685..b03c7a083addf 100644 --- a/Makefile +++ b/Makefile @@ -191,8 +191,8 @@ utils.o: utils.cpp utils.h clean: rm -f *.o main quantize -main: main.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) +main: llama.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) llama.cpp ggml.o utils.o -o main $(LDFLAGS) ./main -h quantize: quantize.cpp ggml.o utils.o diff --git a/main.cpp b/llama.cpp similarity index 100% rename from main.cpp rename to llama.cpp From 82e70dbfe0f2a6d72a4b10b96708e1e0f447358b Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 11:52:55 -0300 Subject: [PATCH 2/5] Move struct definitions in llama.cpp to llama.h Signed-off-by: Thiago Padilha --- llama.cpp | 51 +---------------------------------------------- llama.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 50 deletions(-) create mode 100644 llama.h diff --git a/llama.cpp b/llama.cpp index c88405b82956a..2450e1c3d1f0b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,3 +1,4 @@ +#include "llama.h" #include "ggml.h" #include "utils.h" @@ -35,56 +36,6 @@ static const std::map LLAMA_N_PARTS = { { 8192, 8 }, }; -// default hparams (LLaMA 7B) -struct llama_hparams { - int32_t n_vocab = 32000; - int32_t n_ctx = 512; // this is provided as user input? - int32_t n_embd = 4096; - int32_t n_mult = 256; - int32_t n_head = 32; - int32_t n_layer = 32; - int32_t n_rot = 64; - int32_t f16 = 1; -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - -struct llama_model { - llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - // load the model's weights from a file bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); diff --git a/llama.h b/llama.h new file mode 100644 index 0000000000000..84f4db4081f8e --- /dev/null +++ b/llama.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include + +#include "ggml.h" + + +// default hparams (LLaMA 7B) +struct llama_hparams { + int32_t n_vocab = 32000; + int32_t n_ctx = 512; // this is provided as user input? 
+ int32_t n_embd = 4096; + int32_t n_mult = 256; + int32_t n_head = 32; + int32_t n_layer = 32; + int32_t n_rot = 64; + int32_t f16 = 1; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct llama_model { + llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; From e3648474d6220c25e7fd1bda92e35e4e20753c92 Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 11:58:11 -0300 Subject: [PATCH 3/5] Add main.cpp back, and invoke llama_main from it Signed-off-by: Thiago Padilha --- CMakeLists.txt | 3 ++- Makefile | 7 +++++-- llama.cpp | 2 +- llama.h | 2 ++ main.cpp | 5 +++++ 5 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 803e9b273e800..09ac18b1af816 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,8 +107,9 @@ endif() # endif() add_executable(llama - llama.cpp + main.cpp utils.cpp + llama.cpp utils.h) add_executable(quantize diff --git a/Makefile b/Makefile index b03c7a083addf..b7abaa9a029c5 100644 --- a/Makefile +++ b/Makefile @@ -188,11 +188,14 @@ ggml.o: ggml.c ggml.h utils.o: utils.cpp utils.h $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o +llama.o: llama.cpp llama.h + $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o + clean: rm -f *.o main quantize -main: llama.cpp ggml.o utils.o - $(CXX) $(CXXFLAGS) llama.cpp ggml.o utils.o -o main $(LDFLAGS) +main: main.cpp ggml.o utils.o llama.o + $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o llama.o -o main $(LDFLAGS) ./main -h quantize: quantize.cpp ggml.o utils.o diff --git a/llama.cpp b/llama.cpp index 2450e1c3d1f0b..21369cea2788a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -733,7 +733,7 @@ const char * llama_print_system_info(void) { return s.c_str(); } -int main(int argc, char ** argv) { +int llama_main(int argc, char ** argv) { ggml_time_init(); const int64_t t_main_start_us = ggml_time_us(); diff --git a/llama.h b/llama.h index 84f4db4081f8e..ea71c7402302b 100644 --- a/llama.h +++ b/llama.h @@ -57,3 +57,5 @@ struct llama_model { struct ggml_context * ctx; std::map tensors; }; + +int llama_main(int argc, char ** argv); diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000000000..8b9a3ff50a66d --- /dev/null +++ b/main.cpp @@ -0,0 +1,5 @@ +#include "llama.h" + +int main(int argc, char ** argv) { + return llama_main(argc, argv); +} From 1088d2dd04761968073afe17ba4824e6c1c94703 Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 12:12:00 -0300 Subject: [PATCH 4/5] Move model loading back to main.cpp Signed-off-by: Thiago Padilha --- llama.cpp | 59 ++++++------------------------------------------------- llama.h | 9 ++++++++- main.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 55 deletions(-) diff --git a/llama.cpp b/llama.cpp index 21369cea2788a..35ec3e1401dce 100644 --- a/llama.cpp +++ b/llama.cpp @@ -713,36 +713,12 @@ void sigint_handler(int signo) { } #endif -const char * llama_print_system_info(void) { - static std::string s; - 
- s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - - return s.c_str(); -} - -int llama_main(int argc, char ** argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/llama-7B/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } +int llama_main( + gpt_params params, + gpt_vocab vocab, + llama_model model, + int64_t t_load_us, + int64_t t_main_start_us) { if (params.seed < 0) { params.seed = time(NULL); @@ -758,29 +734,6 @@ int llama_main(int argc, char ** argv) { // params.prompt = R"(// this function checks if the number n is prime //bool is_prime(int n) {)"; - int64_t t_load_us = 0; - - gpt_vocab vocab; - llama_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - int n_past = 0; int64_t t_sample_us = 0; diff --git a/llama.h b/llama.h index ea71c7402302b..9cacb613c71f8 100644 --- a/llama.h +++ b/llama.h @@ -6,6 +6,7 @@ #include #include "ggml.h" +#include "utils.h" // default hparams (LLaMA 7B) @@ -58,4 +59,10 @@ struct llama_model { std::map tensors; }; -int llama_main(int argc, char ** argv); +int llama_main( + gpt_params params, + gpt_vocab vocab, + llama_model model, + int64_t t_load_us, + int64_t t_main_start_us); +bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx); diff --git a/main.cpp b/main.cpp index 8b9a3ff50a66d..7106a8e1978e1 100644 --- a/main.cpp +++ b/main.cpp @@ -1,5 +1,60 @@ +#include "ggml.h" +#include "utils.h" #include "llama.h" +const char * llama_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + 
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + + return s.c_str(); +} + int main(int argc, char ** argv) { - return llama_main(argc, argv); + ggml_time_init(); + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + llama_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + return llama_main(params, vocab, model, t_main_start_us, t_load_us); } From edc17cfa9fe96fb6e0f884ef763044af67abeb8c Mon Sep 17 00:00:00 2001 From: Thiago Padilha Date: Sat, 18 Mar 2023 12:20:20 -0300 Subject: [PATCH 5/5] Remove direct access to std streams from llama_main The goal is to allow running llama_main while connected to other streams, such as TCP sockets. Signed-off-by: Thiago Padilha --- llama.cpp | 69 +++++++++++++++++++++++++++++-------------------------- llama.h | 5 +++- main.cpp | 2 +- 3 files changed, 41 insertions(+), 35 deletions(-) diff --git a/llama.cpp b/llama.cpp index 35ec3e1401dce..05e37a0d6e044 100644 --- a/llama.cpp +++ b/llama.cpp @@ -718,13 +718,16 @@ int llama_main( gpt_vocab vocab, llama_model model, int64_t t_load_us, - int64_t t_main_start_us) { + int64_t t_main_start_us, + FILE *instream, + FILE *outstream, + FILE *errstream) { if (params.seed < 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(errstream, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.prompt.empty()) { @@ -751,13 +754,13 @@ int llama_main( // tokenize the reverse prompt std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + fprintf(errstream, "\n"); + fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); } - fprintf(stderr, "\n"); + fprintf(errstream, "\n"); if (params.interactive) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; @@ -769,19 +772,19 @@ int llama_main( signal(SIGINT, sigint_handler); #endif - fprintf(stderr, "%s: interactive mode on.\n", __func__); + fprintf(errstream, "%s: interactive mode on.\n", __func__); if(antiprompt_inp.size()) { - fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); - fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); + fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); + fprintf(errstream, "%s: number of 
tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); for (int i = 0; i < (int) antiprompt_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); + fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); } - fprintf(stderr, "\n"); + fprintf(errstream, "\n"); } } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); - fprintf(stderr, "\n\n"); + fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(errstream, "\n\n"); std::vector embd; @@ -795,7 +798,7 @@ int llama_main( if (params.interactive) { - fprintf(stderr, "== Running in interactive mode. ==\n" + fprintf(errstream, "== Running in interactive mode. ==\n" #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) " - Press Ctrl+C to interject at any time.\n" #endif @@ -814,7 +817,7 @@ int llama_main( // set the color for the prompt which will be output initially if (params.use_color) { - printf(ANSI_COLOR_YELLOW); + fprintf(outstream, ANSI_COLOR_YELLOW); } while (remaining_tokens > 0) { @@ -823,7 +826,7 @@ int llama_main( const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - fprintf(stderr, "Failed to predict\n"); + fprintf(errstream, "Failed to predict\n"); return 1; } @@ -877,16 +880,16 @@ int llama_main( // reset color to default if we there is no pending user input if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { - printf(ANSI_COLOR_RESET); + fprintf(outstream, ANSI_COLOR_RESET); } } // display text if (!input_noecho) { for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); + fprintf(outstream, "%s", vocab.id_to_token[id].c_str()); } - fflush(stdout); + fflush(outstream); } // in interactive mode, and not currently processing queued inputs; @@ -901,16 +904,16 @@ int llama_main( // currently being interactive bool another_line=true; while (another_line) { - fflush(stdout); + fflush(outstream); char buf[256] = {0}; int n_read; - if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); - if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { + if(params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN); + if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) { // presumable empty line, consume the newline - std::ignore = scanf("%*c"); + std::ignore = fscanf(instream, "%*c"); n_read=0; } - if(params.use_color) printf(ANSI_COLOR_RESET); + if(params.use_color) fprintf(outstream, ANSI_COLOR_RESET); if (n_read > 0 && buf[n_read-1]=='\\') { another_line = true; @@ -936,7 +939,7 @@ int llama_main( // end of text token if (embd.back() == 2) { - fprintf(stderr, " [end of text]\n"); + fprintf(errstream, " [end of text]\n"); break; } } @@ -949,18 +952,18 @@ int llama_main( { const int64_t t_main_end_us = ggml_time_us(); - fprintf(stderr, "\n\n"); - fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", 
__func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+ fprintf(errstream, "\n\n");
+ fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+ fprintf(errstream, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+ fprintf(errstream, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+ fprintf(errstream, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+ fprintf(errstream, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
 }
 
 ggml_free(model.ctx);
 
 if (params.use_color) {
- printf(ANSI_COLOR_RESET);
+ fprintf(outstream, ANSI_COLOR_RESET);
 }
 
 return 0;
diff --git a/llama.h b/llama.h
index 9cacb613c71f8..7c8409d1a158e 100644
--- a/llama.h
+++ b/llama.h
@@ -64,5 +64,8 @@ int llama_main(
 gpt_vocab vocab,
 llama_model model,
 int64_t t_load_us,
- int64_t t_main_start_us);
+ int64_t t_main_start_us,
+ FILE *instream,
+ FILE *outstream,
+ FILE *errstream);
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx);
diff --git a/main.cpp b/main.cpp
index 7106a8e1978e1..e3fc73e750a21 100644
--- a/main.cpp
+++ b/main.cpp
@@ -56,5 +56,5 @@ int main(int argc, char ** argv) {
 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
 }
 
- return llama_main(params, vocab, model, t_main_start_us, t_load_us);
+ return llama_main(params, vocab, model, t_load_us, t_main_start_us, stdin, stdout, stderr);
 }
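
A note on the FILE * parameters introduced in PATCH 5/5: the commit message gives TCP sockets as the motivating example, so a minimal sketch of such a caller follows. It is illustrative only and not part of the series; the POSIX socket setup, the fixed port 8080, the single accepted client and the minimal error handling are all assumptions made for the example, while gpt_params_parse(), llama_model_load() and llama_main() are used with the signatures the patches themselves declare in utils.h and llama.h.

    // tcp_main.cpp (hypothetical example, not added by any patch above)
    #include <cstdint>
    #include <cstdio>

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #include "ggml.h"
    #include "utils.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        ggml_time_init();
        const int64_t t_main_start_us = ggml_time_us();

        gpt_params params;
        params.model = "models/llama-7B/ggml-model.bin";
        if (gpt_params_parse(argc, argv, params) == false) {
            return 1;
        }

        // load the model once, exactly as main.cpp does
        gpt_vocab vocab;
        llama_model model;
        const int64_t t_start_us = ggml_time_us();
        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
            fprintf(stderr, "failed to load model from '%s'\n", params.model.c_str());
            return 1;
        }
        const int64_t t_load_us = ggml_time_us() - t_start_us;

        // listen on an arbitrary port and accept a single client
        // (assumption: one connection, no retry or cleanup logic)
        int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
        sockaddr_in addr = {};
        addr.sin_family      = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port        = htons(8080);
        if (listen_fd < 0 ||
            bind(listen_fd, (sockaddr *) &addr, sizeof(addr)) != 0 ||
            listen(listen_fd, 1) != 0) {
            fprintf(stderr, "failed to listen on port 8080\n");
            return 1;
        }
        int client_fd = accept(listen_fd, NULL, NULL);
        if (client_fd < 0) {
            fprintf(stderr, "accept failed\n");
            return 1;
        }

        // wrap the socket in stdio streams; dup() so the read and write FILE
        // objects own separate descriptors and can be closed independently
        FILE * instream  = fdopen(client_fd, "r");
        FILE * outstream = fdopen(dup(client_fd), "w");

        // diagnostics stay on the server's stderr; prompt input and generated
        // text flow over the socket (arguments follow the order declared in
        // llama.h: t_load_us before t_main_start_us)
        int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                             instream, outstream, stderr);

        fclose(outstream);
        fclose(instream);
        close(listen_fd);
        return ret;
    }

Because llama_main only needs FILE * streams, anything that can be exposed as a file descriptor (a pipe, a pseudo-terminal, a socket) can be wrapped with fdopen() and plugged in the same way, without further changes to llama.cpp.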