Commit 62a45d1

rerank : cleanup + comments

Parent: 6916ed1
5 files changed: 27 additions, 14 deletions

‎examples/embedding/embedding.cpp

1 addition, 1 deletion

@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
             }
         } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
             for (int j = 0; j < n_embd_count; j++) {
-                LOG("rank score %d: %8.3f\n", j, emb[j * n_embd]);
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
             }
         } else {
             // print the first part of the embeddings or for a single prompt, the full embedding
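Note: with RANK pooling the example treats emb as n_embd_count rows of n_embd floats and reads only the first float of each row as the score, which is why the loop indexes emb[j * n_embd]. A minimal standalone sketch of that access pattern, with hypothetical dimensions and scores (not taken from the example):

#include <cstdio>
#include <vector>

int main() {
    // hypothetical dimensions: 3 ranked prompts, model width 4;
    // with RANK pooling only the first float of each row is read as the score
    const int n_embd_count = 3;
    const int n_embd       = 4;

    // illustrative buffer: one n_embd-sized row per prompt, score at offset 0
    std::vector<float> emb(n_embd_count * n_embd, 0.0f);
    emb[0 * n_embd] =  0.75f;
    emb[1 * n_embd] = -2.10f;
    emb[2 * n_embd] =  5.30f;

    for (int j = 0; j < n_embd_count; j++) {
        printf("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
    }

    return 0;
}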

‎examples/server/server.cpp

11 additions, 5 deletions

@@ -1419,7 +1419,7 @@ struct server_context {
         queue_results.send(res);
     }
 
-    void send_rank(const server_slot & slot, const llama_batch & batch) {
+    void send_rerank(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
         res.id    = slot.id_task;
         res.error = false;
@@ -1440,19 +1440,19 @@ struct server_context {
 
                 res.data = json {
                     {"index", slot.index},
-                    {"rank",  -1e6},
+                    {"score", -1e6},
                 };
 
                 continue;
             }
 
             res.data = json {
                 {"index", slot.index},
-                {"rank",  embd[0]},
+                {"score", embd[0]},
             };
         }
 
-        SLT_DBG(slot, "sending rank, res = '%s'\n", res.data.dump().c_str());
+        SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str());
 
         queue_results.send(res);
     }
@@ -1493,6 +1493,9 @@ struct server_context {
         else if (prompt.is_array()) {
             std::vector<json> prompts = prompt;
             if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                // prompts[0] is the question
+                // the rest are the answers/documents
+                SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1);
                 for (size_t i = 1; i < prompts.size(); i++) {
                     json qd;
                     qd.push_back(prompts[0]);
@@ -1501,6 +1504,7 @@ struct server_context {
                     create_task(data, true, qd);
                 }
             } else {
+                SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size());
                 for (size_t i = 0; i < prompts.size(); i++) {
                     const auto & e = prompts[i];
                     if (e.is_string() || json_is_array_of_numbers(e)) {
@@ -1965,6 +1969,7 @@ struct server_context {
         // track if this is an embedding or non-embedding batch
         // if we've added sampled tokens above, we are in non-embedding mode
         // -1: none, 0: non-embedding, 1: embedding
+        // TODO: make enum
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
 
         // next, batch any pending prompts without exceeding n_batch
@@ -2133,6 +2138,7 @@ struct server_context {
                     slot.n_prompt_tokens_processed = 0;
                 }
 
+                // non-causal tasks require to fit the entire prompt in the physical batch
                 if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                     // cannot fit the prompt in the current batch - will try next iter
                     if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
@@ -2318,7 +2324,7 @@ struct server_context {
                 }
 
                 if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                    send_rank(slot, batch_view);
+                    send_rerank(slot, batch_view);
                     slot.release();
                     slot.i_batch = -1;
                     continue; // continue loop of slots
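Note: per the new comments, a rerank prompt array is interpreted as prompts[0] = question and the remaining entries = answers/documents, with one task created per (question, document) pair. A minimal sketch of that pairing, assuming nlohmann::json (the json alias used by the server) and illustrative strings; each resulting qd array corresponds to one create_task(data, true, qd) call in the hunk above:

#include <nlohmann/json.hpp>

#include <cstdio>
#include <vector>

using json = nlohmann::json;

int main() {
    // illustrative input: prompts[0] is the question, the rest are the answers/documents
    std::vector<json> prompts = {
        "what is panda?",
        "hi",
        "it's a bear",
        "The giant panda is a bear species endemic to China.",
    };

    // mirror the pairing done when creating rerank tasks:
    // one (question, document) pair per document, i.e. prompts.size() - 1 tasks
    for (size_t i = 1; i < prompts.size(); i++) {
        json qd;
        qd.push_back(prompts[0]);
        qd.push_back(prompts[i]);
        printf("task %zu: %s\n", i - 1, qd.dump().c_str());
    }

    return 0;
}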

‎examples/server/utils.hpp

1 addition, 1 deletion

@@ -553,7 +553,7 @@ static json format_response_rerank(const json & request, const json & ranks) {
     for (const auto & rank : ranks) {
         data.push_back(json{
             {"index",           i++},
-            {"relevance_score", json_value(rank, "rank", 0.0)},
+            {"relevance_score", json_value(rank, "score", 0.0)},
         });
     }
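Note: after the rename, format_response_rerank reads the "score" field that send_rerank now emits and exposes it as "relevance_score". A minimal sketch of that mapping with made-up scores, using nlohmann::json and .value() in place of the server's json_value helper:

#include <nlohmann/json.hpp>

#include <cstdio>

using json = nlohmann::json;

int main() {
    // illustrative per-task results, shaped like the res.data objects sent by send_rerank
    json ranks = json::array();
    ranks.push_back(json{ {"index", 0}, {"score", -4.221} });
    ranks.push_back(json{ {"index", 1}, {"score",  7.503} });

    // mirror the loop in format_response_rerank, with .value() standing in for json_value
    json data = json::array();
    int i = 0;
    for (const auto & rank : ranks) {
        data.push_back(json{
            {"index",           i++},
            {"relevance_score", rank.value("score", 0.0)},
        });
    }

    printf("%s\n", data.dump(2).c_str());

    return 0;
}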

‎include/llama.h

6 additions, 5 deletions

@@ -192,7 +192,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
         LLAMA_POOLING_TYPE_LAST = 3,
-        LLAMA_POOLING_TYPE_RANK = 4,
+        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
     };
 
     enum llama_attention_type {
@@ -202,9 +202,9 @@ extern "C" {
     };
 
     enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW     = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
     };
 
     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -872,7 +872,8 @@ extern "C" {
 
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // shape: [n_embd] (1-dimensional)
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
     //
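Note: following the updated comment, a caller using LLAMA_POOLING_TYPE_RANK reads the per-sequence score as a single float via llama_get_embeddings_seq. A minimal sketch of such a helper (assuming a context created with a reranking model and RANK pooling, and a sequence that has already been decoded; llama_pooling_type is the existing getter in llama.h):

#include "llama.h"

// returns true and writes the score if RANK pooling is active and the sequence has data
static bool get_rerank_score(struct llama_context * ctx, llama_seq_id seq_id, float & score) {
    if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_RANK) {
        return false;
    }

    // with RANK pooling the per-sequence "embedding" is float[1]: the rerank score
    const float * emb = llama_get_embeddings_seq(ctx, seq_id);
    if (emb == NULL) {
        return false;
    }

    score = emb[0];
    return true;
}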

‎src/llama.cpp

8 additions, 2 deletions

@@ -17009,7 +17009,7 @@ static int llama_decode_internal(
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
                 {
-                    // extract the rank score - a single float per sequence
+                    // extract the rerank score - a single float per sequence
                     auto & embd_seq_out = lctx.embd_seq;
 
                     for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
@@ -17211,7 +17211,6 @@ static int llama_encode_internal(
             case LLAMA_POOLING_TYPE_MEAN:
             case LLAMA_POOLING_TYPE_CLS:
             case LLAMA_POOLING_TYPE_LAST:
-            case LLAMA_POOLING_TYPE_RANK:
                 {
                     // extract sequence embeddings
                     auto & embd_seq_out = lctx.embd_seq;
@@ -17228,6 +17227,13 @@ static int llama_encode_internal(
                         ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                     }
                 } break;
+            case LLAMA_POOLING_TYPE_RANK:
+                {
+                    // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
+                    //       wait for an encoder model that requires this pooling type in order to test it
+                    //       https://github.com/ggerganov/llama.cpp/pull/9510
+                    GGML_ABORT("RANK pooling not implemented yet");
+                }
             case LLAMA_POOLING_TYPE_UNSPECIFIED:
                 {
                     GGML_ABORT("unknown pooling type");
