server : add helper function slot.can_speculate()

ggml-ci
ggml-org · ggerganov · Nov 25, 2024 · Nov 22, 2024 · Nov 25, 2024 · Nov 25, 2024
commit 0ba40c36150e29d1b7893a35e41f806e43b596e3
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -241,6 +241,10 @@ struct server_slot {
        return state != SLOT_STATE_IDLE;
    }

+    bool can_speculate() const { // reports whether this slot currently meets every precondition for speculative decoding
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt; // needs ctx_dft set, a positive speculative.n_max, and cache_prompt enabled
+    }
+
    void add_token(const completion_token_output & token) {
        if (!is_processing()) {
            SLT_WRN(*this, "%s", "slot is not processing\n");
@@ -1270,7 +1274,7 @@ struct server_context {
            {"min_keep",                  slot.params.sampling.min_keep},
            {"grammar",                   slot.params.sampling.grammar},
            {"samplers",                  samplers},
-            {"speculative",               slot.params.speculative.model.empty() ? false : true},
+            {"speculative",               slot.can_speculate()},
            {"speculative.n_max",         slot.params.speculative.n_max},
            {"speculative.n_min",         slot.params.speculative.n_min},
            {"speculative.p_min",         slot.params.speculative.p_min},
@@ -2302,11 +2306,10 @@ struct server_context {
                }

                // check if the slot supports speculative decoding
-                if (!slot.ctx_dft || slot.params.speculative.n_max <= 0 || !slot.params.cache_prompt) {
+                if (!slot.can_speculate()) {
                    continue;
                }

-                // TODO: configurable through requests
                struct common_speculative_params params_spec;
                params_spec.n_draft   = slot.params.speculative.n_max;
                params_spec.n_reuse   = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;