1 file changed: +9 −0 lines changed
```diff
@@ -1057,13 +1057,22 @@ int llama_context::encode(llama_batch & inp_batch) {
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

+    const auto causal_attn_org = cparams.causal_attn;
+
+    // always use non-causal attention for encoder graphs
+    // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+    cparams.causal_attn = false;
+
     auto * gf = graph_init();
     auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);

     ggml_backend_sched_alloc_graph(sched.get(), gf);

     res->set_inputs(&ubatch);

+    cparams.causal_attn = causal_attn_org;
+
     const auto compute_status = graph_compute(gf, n_tokens > 1);
     switch (compute_status) {
         case GGML_STATUS_SUCCESS:
```
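The diff temporarily overrides `cparams.causal_attn` while the encoder graph is built and restores the saved value afterwards. Below is a minimal, self-contained sketch of that same save/override/restore pattern expressed as an RAII guard; the `value_restorer` template and the `toy_cparams` struct are hypothetical illustrations, not llama.cpp types, and this is not how the PR itself implements it (the PR restores the flag manually).

```cpp
// Sketch only: value_restorer and toy_cparams are hypothetical names used to
// illustrate the save/override/restore pattern applied to cparams.causal_attn.
#include <cstdio>
#include <utility>

// Restores a variable to its original value when the guard leaves scope.
template <typename T>
struct value_restorer {
    T & ref;
    T   saved;

    value_restorer(T & r, T tmp) : ref(r), saved(r) { ref = std::move(tmp); }
    ~value_restorer() { ref = std::move(saved); }

    value_restorer(const value_restorer &) = delete;
    value_restorer & operator=(const value_restorer &) = delete;
};

struct toy_cparams {
    bool causal_attn = true; // stand-in for the real context parameter
};

static void build_encoder_graph(const toy_cparams & cp) {
    std::printf("building encoder graph, causal_attn = %s\n", cp.causal_attn ? "true" : "false");
}

int main() {
    toy_cparams cparams;

    {
        // encoder graphs always use non-causal attention, mirroring the diff
        value_restorer<bool> guard(cparams.causal_attn, false);
        build_encoder_graph(cparams); // sees causal_attn == false
    } // original value restored here, even on early return or exception

    std::printf("after encode, causal_attn = %s\n", cparams.causal_attn ? "true" : "false");
    return 0;
}
```

Compared with the manual restore in the diff, a guard like this keeps the flag consistent if the function returns early between the override and the restore.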