Commit d0e0d1a

JSON: [key] -> .at(key), assert() -> GGML_ASSERT
1 parent 3855416 commit d0e0d1a
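
For readers unfamiliar with the two mechanisms this commit leans on, here is a minimal standalone sketch (not part of the diff; the file name and main() harness are illustrative only): nlohmann::json's at() throws on a missing key instead of silently inserting one the way the non-const operator[] does, and the library routes its internal assertions through JSON_ASSERT, which falls back to assert() unless the macro is defined before json.hpp is included.

    // sketch.cpp -- illustrative only; assumes ggml.h and json.hpp are on the include path
    #include "ggml.h"
    // Change JSON_ASSERT from assert() to GGML_ASSERT (must come before json.hpp):
    #define JSON_ASSERT GGML_ASSERT
    #include "json.hpp"

    #include <cstdio>
    #include <string>

    using json = nlohmann::json;

    int main() {
        json metadata = {{"url", "https://example.com/model.gguf"}};

        // operator[] on a non-const object default-inserts a null value for a
        // missing key, so a typo in the key name can go unnoticed; on a const
        // object a missing key hits JSON_ASSERT (here GGML_ASSERT) instead.
        std::string url = metadata["url"].get<std::string>();
        fprintf(stderr, "url: %s\n", url.c_str());

        // at() never inserts: a missing key throws json::out_of_range in both
        // const and non-const contexts, so bad keys fail loudly.
        try {
            std::string etag = metadata.at("etag").get<std::string>();
        } catch (const json::out_of_range & e) {
            fprintf(stderr, "missing key: %s\n", e.what());
        }
        return 0;
    }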

File tree

4 files changed, +84 -74 lines changed

common/common.cpp

8 additions & 6 deletions
@@ -1,4 +1,6 @@
 #include "common.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -1964,18 +1966,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     try {
         metadata_in >> metadata;
         fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-        if (metadata.contains("url") && metadata["url"].is_string()) {
-            auto previous_url = metadata["url"].get<std::string>();
+        if (metadata.contains("url") && metadata.at("url").is_string()) {
+            auto previous_url = metadata.at("url").get<std::string>();
             if (previous_url != url) {
                 fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                 return false;
             }
         }
-        if (metadata.contains("etag") && metadata["etag"].is_string()) {
-            etag = metadata["etag"];
+        if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+            etag = metadata.at("etag");
         }
-        if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
-            last_modified = metadata["lastModified"];
+        if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+            last_modified = metadata.at("lastModified");
         }
     } catch (const nlohmann::json::exception & e) {
         fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());

common/json-schema-to-grammar.h

4 additions & 0 deletions
@@ -1,4 +1,8 @@
 #pragma once
+
+#include "ggml.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
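
An editorial note on the header change above (not part of the diff): json.hpp only honors a custom JSON_ASSERT if the macro is already defined when the header is first included in a translation unit; otherwise it falls back to plain assert(). Adding the include of ggml.h (for GGML_ASSERT) and the define to common/json-schema-to-grammar.h, directly above its own #include "json.hpp", means every file that reaches json.hpp through this header picks up the override automatically. The assumed ordering rule:

    // Order matters: define the hook, then include the library.
    #include "ggml.h"                    // provides GGML_ASSERT
    #define JSON_ASSERT GGML_ASSERT
    #include "json.hpp"                  // library assertions now route to GGML_ASSERT

    // Reversed order would keep the default:
    // #include "json.hpp"               // falls back to #define JSON_ASSERT(x) assert(x)
    // #define JSON_ASSERT GGML_ASSERT   // too late, json.hpp is already configured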

examples/server/server.cpp

52 additions & 50 deletions
@@ -12,6 +12,8 @@
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 // auto generated files (update with ./deps.sh)
@@ -745,7 +747,7 @@ struct server_context {
        }
 
        default_generation_settings_for_props = get_formated_generation(slots.front());
-       default_generation_settings_for_props["seed"] = -1;
+       default_generation_settings_for_props.at("seed") = -1;
 
        // the update_slots() logic will always submit a maximum of n_batch tokens
        // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
@@ -859,7 +861,7 @@ struct server_context {
        slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
 
        // process "json_schema" and "grammar"
-       if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
+       if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
            return false;
        } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1343,12 +1345,12 @@ struct server_context {
            }
            slot.n_sent_token_probs = probs_stop_pos;
 
-           res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
+           res.data.at("completion_probabilities") = probs_vector_to_json(ctx, probs_output);
        }
 
        if (slot.oaicompat) {
-           res.data["oaicompat_token_ctr"] = slot.n_decoded;
-           res.data["model"] = slot.oaicompat_model;
+           res.data.at("oaicompat_token_ctr") = slot.n_decoded;
+           res.data.at("model") = slot.oaicompat_model;
        }
 
        queue_results.send(res);
@@ -1393,12 +1395,12 @@ struct server_context {
                    slot.generated_token_probs.end());
            }
 
-           res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
+           res.data.at("completion_probabilities") = probs_vector_to_json(ctx, probs);
        }
 
        if (slot.oaicompat) {
-           res.data["oaicompat_token_ctr"] = slot.n_decoded;
-           res.data["model"] = slot.oaicompat_model;
+           res.data.at("oaicompat_token_ctr") = slot.n_decoded;
+           res.data.at("model") = slot.oaicompat_model;
        }
 
        queue_results.send(res);
@@ -1512,7 +1514,7 @@ struct server_context {
        // add subtasks
        for (int i = 0; i < prompt_count; i++) {
            json subtask_data = multiprompt_task.data;
-           subtask_data["prompt"] = subtask_data["prompt"][i];
+           subtask_data.at("prompt") = subtask_data.at("prompt")[i];
 
            // subtasks inherit everything else (infill mode, embedding mode, etc.)
            request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1532,7 +1534,7 @@ struct server_context {
        }
 
        if (task.data.contains("system_prompt")) {
-           system_prompt_set(task.data["system_prompt"]);
+           system_prompt_set(task.data.at("system_prompt"));
 
            for (server_slot & slot : slots) {
                slot.n_past = 0;
@@ -1575,11 +1577,11 @@ struct server_context {
 
        for (server_slot & slot : slots) {
            json slot_data = get_formated_generation(slot);
-           slot_data["id"] = slot.id;
-           slot_data["id_task"] = slot.id_task;
-           slot_data["state"] = slot.state;
-           slot_data["prompt"] = slot.prompt;
-           slot_data["next_token"] = {
+           slot_data.at("id") = slot.id;
+           slot_data.at("id_task") = slot.id_task;
+           slot_data.at("state") = slot.state;
+           slot_data.at("prompt") = slot.prompt;
+           slot_data.at("next_token") = {
                {"has_next_token", slot.has_next_token},
                {"n_remain", slot.n_remaining},
                {"n_decoded", slot.n_decoded},
@@ -1589,7 +1591,7 @@ struct server_context {
                {"stopping_word", slot.stopping_word},
            };
 
-           if (slot_data["state"] == SLOT_STATE_IDLE) {
+           if (slot_data.at("state") == SLOT_STATE_IDLE) {
                n_idle_slots++;
            } else {
                n_processing_slots++;
@@ -1644,7 +1646,7 @@ struct server_context {
            } break;
        case SERVER_TASK_TYPE_SLOT_SAVE:
            {
-               int id_slot = task.data["id_slot"];
+               int id_slot = task.data.at("id_slot");
                server_slot * slot = get_slot(id_slot);
                if (slot == nullptr) {
                    send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1654,8 +1656,8 @@ struct server_context {
                const size_t token_count = slot->cache_tokens.size();
                const int64_t t_start = ggml_time_us();
 
-               std::string filename = task.data["filename"];
-               std::string filepath = task.data["filepath"];
+               std::string filename = task.data.at("filename");
+               std::string filepath = task.data.at("filepath");
 
                const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
 
@@ -1679,7 +1681,7 @@ struct server_context {
            } break;
        case SERVER_TASK_TYPE_SLOT_RESTORE:
            {
-               int id_slot = task.data["id_slot"];
+               int id_slot = task.data.at("id_slot");
                server_slot * slot = get_slot(id_slot);
                if (slot == nullptr) {
                    send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1688,8 +1690,8 @@ struct server_context {
 
                const int64_t t_start = ggml_time_us();
 
-               std::string filename = task.data["filename"];
-               std::string filepath = task.data["filepath"];
+               std::string filename = task.data.at("filename");
+               std::string filepath = task.data.at("filepath");
 
                slot->cache_tokens.resize(slot->n_ctx);
                size_t token_count = 0;
@@ -1721,7 +1723,7 @@ struct server_context {
            } break;
        case SERVER_TASK_TYPE_SLOT_ERASE:
            {
-               int id_slot = task.data["id_slot"];
+               int id_slot = task.data.at("id_slot");
                server_slot * slot = get_slot(id_slot);
                if (slot == nullptr) {
                    send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3136,8 +3138,8 @@ int main(int argc, char ** argv) {
        server_task_result result = ctx_server.queue_results.recv(task.id);
        ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-       const int n_idle_slots = result.data["idle"];
-       const int n_processing_slots = result.data["processing"];
+       const int n_idle_slots = result.data.at("idle");
+       const int n_processing_slots = result.data.at("processing");
 
        json health = {
            {"status", "ok"},
@@ -3147,11 +3149,11 @@ int main(int argc, char ** argv) {
 
        res.status = 200; // HTTP OK
        if (sparams.slots_endpoint && req.has_param("include_slots")) {
-           health["slots"] = result.data["slots"];
+           health.at("slots") = result.data.at("slots");
        }
 
        if (n_idle_slots == 0) {
-           health["status"] = "no slot available";
+           health.at("status") = "no slot available";
            if (req.has_param("fail_on_no_slot")) {
                res.status = 503; // HTTP Service Unavailable
            }
@@ -3191,7 +3193,7 @@ int main(int argc, char ** argv) {
        server_task_result result = ctx_server.queue_results.recv(task.id);
        ctx_server.queue_results.remove_waiting_task_id(task.id);
 
-       res.set_content(result.data["slots"].dump(), "application/json");
+       res.set_content(result.data.at("slots").dump(), "application/json");
        res.status = 200; // HTTP OK
    };
 
@@ -3218,32 +3220,32 @@ int main(int argc, char ** argv) {
 
        json data = result.data;
 
-       const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-       const uint64_t t_prompt_processing = data["t_prompt_processing"];
+       const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
+       const uint64_t t_prompt_processing = data.at("t_prompt_processing");
 
-       const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
-       const uint64_t t_tokens_generation = data["t_tokens_generation"];
+       const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
+       const uint64_t t_tokens_generation = data.at("t_tokens_generation");
 
-       const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+       const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
 
        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
        json all_metrics_def = json {
            {"counter", {{
                {"name", "prompt_tokens_total"},
                {"help", "Number of prompt tokens processed."},
-               {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+               {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
            }, {
                {"name", "prompt_seconds_total"},
                {"help", "Prompt process time"},
-               {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+               {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
            }, {
                {"name", "tokens_predicted_total"},
                {"help", "Number of generation tokens processed."},
-               {"value", (uint64_t) data["n_tokens_predicted_total"]}
+               {"value", (uint64_t) data.at("n_tokens_predicted_total")}
            }, {
                {"name", "tokens_predicted_seconds_total"},
                {"help", "Predict process time"},
-               {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+               {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
            }}},
            {"gauge", {{
                {"name", "prompt_tokens_seconds"},
@@ -3260,15 +3262,15 @@ int main(int argc, char ** argv) {
            },{
                {"name", "kv_cache_tokens"},
                {"help", "KV-cache tokens."},
-               {"value", (uint64_t) data["kv_cache_tokens_count"]}
+               {"value", (uint64_t) data.at("kv_cache_tokens_count")}
            },{
                {"name", "requests_processing"},
                {"help", "Number of request processing."},
-               {"value", (uint64_t) data["processing"]}
+               {"value", (uint64_t) data.at("processing")}
            },{
                {"name", "requests_deferred"},
                {"help", "Number of request deferred."},
-               {"value", (uint64_t) data["deferred"]}
+               {"value", (uint64_t) data.at("deferred")}
            }}}
        };
 
@@ -3279,8 +3281,8 @@ int main(int argc, char ** argv) {
            const auto & metrics_def = el.value();
 
            for (const auto & metric_def : metrics_def) {
-               const std::string name = metric_def["name"];
-               const std::string help = metric_def["help"];
+               const std::string name = metric_def.at("name");
+               const std::string help = metric_def.at("help");
 
                auto value = json_value(metric_def, "value", 0.);
                prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
@@ -3289,7 +3291,7 @@ int main(int argc, char ** argv) {
            }
        }
 
-       const int64_t t_start = data["t_start"];
+       const int64_t t_start = data.at("t_start");
        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
 
        res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3298,7 +3300,7 @@ int main(int argc, char ** argv) {
 
    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-       std::string filename = request_data["filename"];
+       std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3328,7 +3330,7 @@ int main(int argc, char ** argv) {
 
    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-       std::string filename = request_data["filename"];
+       std::string filename = request_data.at("filename");
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3647,7 +3649,7 @@ int main(int argc, char ** argv) {
 
        std::vector<llama_token> tokens;
        if (body.count("content") != 0) {
-           tokens = ctx_server.tokenize(body["content"], false);
+           tokens = ctx_server.tokenize(body.at("content"), false);
        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3659,7 +3661,7 @@ int main(int argc, char ** argv) {
 
        std::string content;
        if (body.count("tokens") != 0) {
-           const std::vector<llama_token> tokens = body["tokens"];
+           const std::vector<llama_token> tokens = body.at("tokens");
            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
        }
 
@@ -3682,10 +3684,10 @@ int main(int argc, char ** argv) {
        json prompt;
        if (body.count("input") != 0) {
            is_openai = true;
-           prompt = body["input"];
+           prompt = body.at("input");
        } else if (body.count("content") != 0) {
            // with "content", we only support single prompt
-           prompt = std::vector<std::string>{body["content"]};
+           prompt = std::vector<std::string>{body.at("content")};
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3704,7 +3706,7 @@ int main(int argc, char ** argv) {
        if (!result.error) {
            if (result.data.count("results")) {
                // result for multi-task
-               responses = result.data["results"];
+               responses = result.data.at("results");
            } else {
                // result for single task
                responses = std::vector<json>{result.data};

0 commit comments
