Commit 48d9f25

fix padding, GQA
1 parent: 8d56dad

2 files changed: 5 additions, 5 deletions

ggml-cuda.cu

3 additions & 3 deletions
@@ -7552,9 +7552,9 @@ static __global__ void flash_attn_ext_f16(
     __builtin_assume(tid < nthreads);
     constexpr int D_padded = D + 8; // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts.

-    const float * Q_f = (const float *) (Q + nb02*blockIdx.y + ncols*nb01*blockIdx.x);
-    const half * K_h = (const half *) (K + nb12*blockIdx.y);
-    const half * V_h = (const half *) (V + nb12*blockIdx.y); // K and V have same shape
+    const float * Q_f = (const float *) (Q + nb02* blockIdx.y + ncols*nb01*blockIdx.x);
+    const half * K_h = (const half *) (K + nb12*(blockIdx.y % ne12));
+    const half * V_h = (const half *) (V + nb12*(blockIdx.y % ne12)); // K and V have same shape
     const half2 * mask2 = (half2 *) mask + ncols*ne11*blockIdx.x/2;

     const int stride_Q = nb01 / sizeof(float);
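The GQA part of the fix: with grouped-query attention the K and V tensors carry fewer heads (ne12) than Q, so indexing them directly with the query-head index (blockIdx.y) reads past the available K/V heads; the modulo wraps each query head onto the K/V head it shares. A minimal sketch of that wrap-around mapping, separate from the kernel itself — the head counts below are illustrative assumptions, not values taken from any model:

// Sketch only: how a GQA kernel maps query heads onto the smaller set of K/V heads.
#include <cassert>
#include <cstdio>

int main() {
    const int n_head_q  = 32; // query heads; the kernel walks these via blockIdx.y
    const int n_head_kv = 8;  // K/V heads; plays the role of ne12 in the kernel
    assert(n_head_q % n_head_kv == 0); // GQA models use an integer group size

    for (int iq = 0; iq < n_head_q; ++iq) {
        const int ikv = iq % n_head_kv; // same wrap-around as blockIdx.y % ne12
        printf("Q head %2d reads K/V head %d\n", iq, ikv);
    }
    return 0;
}

Before this change the kernel indexed K and V with blockIdx.y directly, which is only valid when the K/V head count equals the query head count.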

llama.cpp

2 additions & 2 deletions
@@ -9166,7 +9166,7 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(128u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 128)));
+            kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
@@ -13083,7 +13083,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);

     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
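The padding part of the fix: both llama.cpp changes raise the rounding granularity to 256, so the attended KV range (kv_self.n) and the context size (cparams.n_ctx) stay multiples of 256 rather than 128 or 32; the motivation is inferred from the commit message "fix padding", not stated in the diff. A standalone sketch of the rounding arithmetic — pad_to is a hypothetical helper mirroring what GGML_PAD does (round up to the next multiple, assuming a power-of-two pad size), and the sizes are chosen only for illustration:

// Sketch of the padding arithmetic used above (not llama.cpp code).
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t pad_to(uint32_t x, uint32_t n) {
    return (x + n - 1) & ~(n - 1); // round x up to a multiple of n (n must be a power of two)
}

int main() {
    const uint32_t kv_size  = 4096; // total KV cache size in cells (illustrative)
    const uint32_t cell_max = 300;  // highest KV cell in use, as llama_kv_cache_cell_max() would report

    // old behaviour: pad the attended KV range to a multiple of 128
    const uint32_t n_old = std::min(kv_size, std::max(128u, pad_to(cell_max, 128)));
    // new behaviour: pad to a multiple of 256
    const uint32_t n_new = std::min(kv_size, std::max(256u, pad_to(cell_max, 256)));

    printf("cell_max=%u -> kv_self.n old=%u new=%u\n", cell_max, n_old, n_new); // 300 -> 384 vs 512
    return 0;
}

Padding n_ctx with the same constant at context creation keeps the later kv_self.n rounding from ever exceeding the allocated cache.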
