2 files changed, +5 -5 lines changed
@@ -7552,9 +7552,9 @@ static __global__ void flash_attn_ext_f16(
     __builtin_assume(tid < nthreads);
     constexpr int D_padded = D + 8; // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts.
 
-    const float * Q_f = (const float *) (Q + nb02*blockIdx.y + ncols*nb01*blockIdx.x);
-    const half  * K_h = (const half  *) (K + nb12*blockIdx.y);
-    const half  * V_h = (const half  *) (V + nb12*blockIdx.y); // K and V have same shape
+    const float * Q_f = (const float *) (Q + nb02* blockIdx.y + ncols*nb01*blockIdx.x);
+    const half  * K_h = (const half  *) (K + nb12*(blockIdx.y % ne12));
+    const half  * V_h = (const half  *) (V + nb12*(blockIdx.y % ne12)); // K and V have same shape
     const half2 * mask2 = (half2 *) mask + ncols*ne11*blockIdx.x/2;
 
     const int stride_Q = nb01 / sizeof(float);
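The `% ne12` term lets K and V, which have ne12 heads, be broadcast across a larger number of Q heads: the Q head index carried in blockIdx.y is wrapped back into the K/V head range before the byte offset is computed. A minimal host-side sketch of that index arithmetic (the head counts below are hypothetical, not taken from the kernel):

#include <cstdio>

int main() {
    const int ne02 = 32; // number of Q heads (assumed for illustration)
    const int ne12 = 8;  // number of K/V heads (assumed for illustration)

    // A block with blockIdx.y == h processes Q head h; the K/V head it reads
    // is h % ne12, i.e. the index wrapped into the smaller K/V head range.
    for (int h = 0; h < ne02; ++h) {
        printf("Q head %2d -> K/V head %d\n", h, h % ne12);
    }
    return 0;
}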
@@ -9166,7 +9166,7 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(128u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 128)));
+            kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
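GGML_PAD rounds its first argument up to a multiple of the second, so with this change kv_self.n grows in 256-cell steps, clamped between 256 and the allocated cache size. A small sketch of the same arithmetic, assuming GGML_PAD(x, n) behaves like ((x + n - 1) / n) * n and using hypothetical cell counts:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for GGML_PAD: round x up to the next multiple of n (assumption).
static uint32_t pad_to(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

int main() {
    const uint32_t kv_size = 4096; // allocated KV cache cells (hypothetical)
    const uint32_t cell_maxs[] = {1, 100, 300, 5000}; // highest used cell (hypothetical)
    for (uint32_t cell_max : cell_maxs) {
        const uint32_t n = std::min(kv_size, std::max(256u, pad_to(cell_max, 256)));
        printf("cell_max = %4u -> kv_self.n = %4u\n", cell_max, n);
    }
    return 0;
}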
@@ -13083,7 +13083,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
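Padding n_ctx to the same 256-cell step makes kv_self.size a multiple of 256 as well, so the std::min clamp in the hunk above cannot yield an unpadded kv_self.n once the cache fills up. A sketch of the difference, with hypothetical sizes and the same assumed GGML_PAD behaviour:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for GGML_PAD: round x up to the next multiple of n (assumption).
static uint32_t pad_to(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

int main() {
    const uint32_t cell_max = 4000; // cells in use (hypothetical)
    const uint32_t sizes[] = {4000, pad_to(4000, 256)}; // unpadded vs padded cache size
    for (uint32_t kv_size : sizes) {
        const uint32_t n = std::min(kv_size, std::max(256u, pad_to(cell_max, 256)));
        printf("kv_self.size = %4u -> kv_self.n = %4u (%s multiple of 256)\n",
               kv_size, n, n % 256 == 0 ? "a" : "not a");
    }
    return 0;
}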