Commit 03562f3

llama : support attention bias on LLaMA architecture (#4283)
* Support attention_bias on LLaMA architecture: QKVO bias, should fix InternLM (#3133) and works for LLaMAfied Qwen models (#3743 (comment)).
* check existence of qkvo bias while loading llama models. Tested on LLaMA2, CUDA and CPU.
* Update llama.cpp
1 parent 37c746d commit 03562f3
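For readers outside the codebase: with this change each attention projection may carry an additive bias vector, i.e. Q = Wq·x + bq, K = Wk·x + bk, V = Wv·x + bv, plus an output-projection bias bo, which is how InternLM and LLaMAfied Qwen checkpoints are trained. Below is a minimal standalone sketch of that computation for one projection, in plain C++ with toy shapes; it is an illustration only, not code from this commit (the commit does the same thing with ggml_mul_mat/ggml_add).

    #include <cstdio>
    #include <vector>

    // Toy dense projection y = W*x (+ b). The bias pointer may be null,
    // mirroring how the loader leaves layer.bq/bk/bv/bo as NULL when a
    // model file carries no bias tensor.
    static std::vector<float> project(const std::vector<float> & W,   // rows*cols, row-major
                                      const std::vector<float> & x,   // cols
                                      const std::vector<float> * b,   // rows, optional
                                      int rows, int cols) {
        std::vector<float> y(rows, 0.0f);
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                y[r] += W[r*cols + c]*x[c];
            }
            if (b) {
                y[r] += (*b)[r]; // bias applied only when present
            }
        }
        return y;
    }

    int main() {
        const std::vector<float> W = {1, 0, 0, 1}; // 2x2 identity
        const std::vector<float> x = {3, 4};
        const std::vector<float> b = {0.5f, -0.5f};
        const auto q = project(W, x, &b,      2, 2); // projection with bias
        const auto k = project(W, x, nullptr, 2, 2); // projection without bias
        std::printf("q = [%.1f, %.1f]  k = [%.1f, %.1f]\n", q[0], q[1], k[0], k[1]);
        return 0;
    }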

File tree

1 file changed: +48 −4 lines changed

‎llama.cpp

+48 −4: 48 additions & 4 deletions
@@ -1266,6 +1266,9 @@ struct llama_layer {
 struct ggml_tensor * wqkv;
 
 // attention bias
+struct ggml_tensor * bq;
+struct ggml_tensor * bk;
+struct ggml_tensor * bv;
 struct ggml_tensor * bo;
 struct ggml_tensor * bqkv;
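In the pre-existing struct, bo and bqkv were already there for architectures with an output or fused-QKV bias; the new bq/bk/bv fields complete the set for separate Q/K/V projections. A condensed view of the resulting per-layer attention members (a sketch, not the full struct; a pointer left as NULL simply means the model file has no such tensor):

    struct ggml_tensor; // opaque here; defined by ggml

    // Condensed sketch of llama_layer's attention members after this hunk.
    struct llama_layer_attn_sketch {
        ggml_tensor * wq; ggml_tensor * bq;     // query  weight + optional bias
        ggml_tensor * wk; ggml_tensor * bk;     // key    weight + optional bias
        ggml_tensor * wv; ggml_tensor * bv;     // value  weight + optional bias
        ggml_tensor * wo; ggml_tensor * bo;     // output weight + optional bias
        ggml_tensor * wqkv; ggml_tensor * bqkv; // fused QKV weight + optional bias
    };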
@@ -2809,6 +2812,30 @@ static void llm_load_tensors(
 layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
 layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
 
+try {
+    layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+} catch (const std::runtime_error& e) {
+    if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
+}
+
+try {
+    layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+} catch (const std::runtime_error& e) {
+    if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
+}
+
+try {
+    layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+} catch (const std::runtime_error& e) {
+    if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
+}
+
+try {
+    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+} catch (const std::runtime_error& e) {
+    if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
+}
+
 layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
 layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
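Each try/catch above follows the same pattern: ask the loader for the bias tensor by its GGUF name (tn(LLM_TENSOR_ATTN_Q, "bias", i) resolves to something like blk.<i>.attn_q.bias), keep NULL if the file simply does not contain it, and re-throw any other loader error. A hypothetical helper capturing that pattern is sketched below; it is not part of the commit, and ml.create_tensor/tn are the surrounding llama.cpp loader facilities.

    #include <stdexcept>
    #include <string>
    #include <utility>

    struct ggml_tensor; // opaque here; defined by ggml

    // Hypothetical refactor of the repeated blocks above: run a tensor-loading
    // callable, map the loader's "not found" error to nullptr, re-throw the rest.
    template <typename LoadFn>
    static ggml_tensor * load_optional_tensor(LoadFn && load) {
        try {
            return std::forward<LoadFn>(load)();
        } catch (const std::runtime_error & e) {
            if (std::string(e.what()).find("not found") != std::string::npos) {
                return nullptr; // model has no such tensor (e.g. no attention bias)
            }
            throw; // any other loader failure stays fatal
        }
    }

    // usage, mirroring the diff:
    //   layer.bq = load_optional_tensor([&] {
    //       return ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
    //   });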
@@ -2817,9 +2844,14 @@ static void llm_load_tensors(
 
     if (backend == GGML_BACKEND_GPU) {
         vram_weights +=
-            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-            ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+            (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+            (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+            (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+            (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
     }
 }
 } break;
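The VRAM accounting change is mechanical: each optional bias contributes its size only when the tensor was actually loaded. The same null-check-then-count idea, pulled out into a hypothetical helper for clarity (not in the commit; ggml_nbytes is the existing ggml size query):

    #include <cstddef>

    struct ggml_tensor; // opaque here; defined by ggml
    extern "C" std::size_t ggml_nbytes(const struct ggml_tensor * tensor); // from ggml.h

    // Hypothetical helper: count a tensor's bytes only if it exists.
    static std::size_t nbytes_or_zero(const ggml_tensor * t) {
        return t ? ggml_nbytes(t) : 0;
    }

    // vram_weights += nbytes_or_zero(layer.bq) + nbytes_or_zero(layer.bk) +
    //                 nbytes_or_zero(layer.bv) + nbytes_or_zero(layer.bo);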
@@ -3983,12 +4015,24 @@ struct llm_build_context {
 // compute Q and K and RoPE them
 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
+if (model.layers[il].bq) {
+    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+    cb(Qcur, "Qcur", il);
+}
 
 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
 cb(Kcur, "Kcur", il);
+if (model.layers[il].bk) {
+    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+    cb(Kcur, "Kcur", il);
+}
 
 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);
+if (model.layers[il].bv) {
+    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+    cb(Vcur, "Vcur", il);
+}
 
 Qcur = ggml_rope_custom(
     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
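Each projection in this hunk follows the same shape: ggml_mul_mat produces the projected activations and, when the corresponding bias tensor exists, a single ggml_add applies it (the bias vector is broadcast across the token dimension), before Q and K are reshaped and rotated by RoPE. A condensed restatement of the Q path above, with orientation comments added (same calls as the diff, trimmed):

    // Q projection: Qcur[n_embd, n_tokens] = wq * cur
    struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
    cb(Qcur, "Qcur", il);

    // Optional bias: Qcur += bq (bq is [n_embd]); skipped entirely for classic
    // LLaMA checkpoints, where bq stays NULL after loading.
    if (model.layers[il].bq) {
        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
        cb(Qcur, "Qcur", il);
    }
    // RoPE (ggml_rope_custom) is applied after this, so the rotation sees the biased Q.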
@@ -4007,7 +4051,7 @@
 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
 cur = llm_build_kqv(ctx0, hparams, kv_self,
-        model.layers[il].wo, NULL,
+        model.layers[il].wo, model.layers[il].bo,
         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
 cb(cur, "kqv_out", il);
 }
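For the output projection no new graph code is needed: llm_build_kqv already accepts an optional bias alongside wo (LLaMA previously passed NULL there), and inside the helper the bias is added after the wo matmul when it is non-NULL. Roughly, the relevant tail of that helper looks like the sketch below (assumed from the helper's existing structure; see llama.cpp for the exact code):

    // Inside llm_build_kqv (sketch of the assumed tail): project the merged
    // attention output with wo, then apply the optional output bias (here, bo).
    cur = ggml_mul_mat(ctx, wo, cur);    // output projection
    if (wo_b) {
        cur = ggml_add(ctx, cur, wo_b);  // += bo, broadcast over the token dimension
    }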

0 commit comments
