Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit cb33f43

Browse files
authored
fix embeddings when using CUDA (#3657)
1 parent e1675d1 commit cb33f43
Copy full SHA for cb33f43

File tree

Expand file tree / Collapse file tree

1 file changed

+13
-6
lines changed
Open diff view settings
Filter options
Expand file tree / Collapse file tree

1 file changed

+13
-6
lines changed
Open diff view settings
Collapse file

‎llama.cpp‎

Copy file name to clipboard · Expand all lines: llama.cpp
+13-6Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5903,6 +5903,13 @@ static int llama_decode_internal(
59035903

59045904
ggml_allocr_alloc_graph(lctx.alloc, gf);
59055905

5906+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
5907+
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
5908+
5909+
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
5910+
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
5911+
5912+
59065913
#ifdef GGML_USE_CUBLAS
59075914
for (int i = 0; i < gf->n_leafs; i++) {
59085915
ggml_tensor * node = gf->leafs[i];
@@ -5920,6 +5927,12 @@ static int llama_decode_internal(
59205927
}
59215928

59225929
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
5930+
5931+
// HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
5932+
if (!lctx.embedding.empty()) {
5933+
embeddings->backend = GGML_BACKEND_CPU;
5934+
}
5935+
res->backend = GGML_BACKEND_CPU;
59235936
#endif
59245937

59255938
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -5944,12 +5957,6 @@ static int llama_decode_internal(
59445957
n_threads = 1;
59455958
}
59465959

5947-
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
5948-
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
5949-
5950-
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
5951-
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
5952-
59535960
#if GGML_USE_MPI
59545961
const int64_t n_layer = hparams.n_layer;
59555962
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.