#define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

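As background for the two gates above, here is a minimal self-contained sketch of how a CUDART_* minimum-version constant is typically consumed; the helper name and the float-based fallback are illustrative assumptions, not code from this commit:

```cpp
#include <cuda_runtime.h>
#include <cuda_fp16.h>

#define CUDART_HMAX 11070 // mirrors the gate above: __hmax2 known to work from CUDA 11.7

// Hypothetical helper (not part of this patch): use the native __hmax2 when the
// toolkit is new enough, otherwise emulate the per-lane max through float.
static __device__ __forceinline__ half2 hmax2_compat(const half2 a, const half2 b) {
#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 r;
    r.x = __float2half(fmaxf(__half2float(a.x), __half2float(b.x)));
    r.y = __float2half(fmaxf(__half2float(a.y), __half2float(b.y)));
    return r;
#endif
}
```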
- #define GGML_CUDA_CC_PASCAL     600
- #define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
- #define GGML_CUDA_CC_VOLTA      700
- #define GGML_CUDA_CC_TURING     750
- #define GGML_CUDA_CC_AMPERE     800
- #define GGML_CUDA_CC_ADA_LOVELACE 890
- #define GGML_CUDA_CC_OFFSET_AMD 0x1000000
-
+ #define GGML_CUDA_CC_PASCAL          600
+ #define GGML_CUDA_CC_DP4A            610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define GGML_CUDA_CC_VOLTA           700
+ #define GGML_CUDA_CC_TURING          750
+ #define GGML_CUDA_CC_AMPERE          800
+ #define GGML_CUDA_CC_ADA_LOVELACE    890
+ #define GGML_CUDA_CC_OFFSET_AMD      0x1000000
+ #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
+ #define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
+
+ // AMD
// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
…
#define GGML_CUDA_CC_IS_GCN(cc)      (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
#define GGML_CUDA_CC_IS_CDNA(cc)     (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)

- #define GGML_CUDA_CC_QY1 210
- #define GGML_CUDA_CC_QY2 220
+ // Moore Threads
+ #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
+
+ #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+ #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+ #define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+
+ #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
+ #define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
+ #define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
+ #define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

#ifdef __CUDA_ARCH_LIST__
constexpr bool ggml_cuda_has_arch_impl(int) {
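As a review aid, a small self-contained sketch of what the new encoding buys: each vendor occupies a disjoint numeric range, so one integer comparison classifies a compute-capability value. `GGML_CUDA_CC_IS_AMD` is assumed here to be the matching range check defined elsewhere in this header:

```cpp
#include <cstdio>

#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_OFFSET_AMD      0x1000000

#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc <  GGML_CUDA_CC_OFFSET_MTHREADS)
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_AMD(cc)      (cc >= GGML_CUDA_CC_OFFSET_AMD) // assumed complementary check

int main() {
    const int ccs[3] = {
        750,                                  // NVIDIA Turing: raw SM value, below both offsets
        GGML_CUDA_CC_OFFSET_MTHREADS + 0x210, // Moore Threads QY1 (MTT S80)
        GGML_CUDA_CC_OFFSET_AMD + 0x900,      // AMD Vega
    };
    for (const int cc : ccs) {
        std::printf("cc=0x%07x nvidia=%d mthreads=%d amd=%d\n", cc,
                    GGML_CUDA_CC_IS_NVIDIA(cc), GGML_CUDA_CC_IS_MTHREADS(cc), GGML_CUDA_CC_IS_AMD(cc));
    }
    return 0;
}
```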
@@ -209,42 +221,42 @@ typedef float2 dfloat2;
#define CP_ASYNC_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

- #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+ #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
#define FLASH_ATTN_AVAILABLE
- #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+ #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)

static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
}

static bool fast_fp16_available(const int cc) {
-     return fp16_available(cc) && cc != 610;
+     return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
-     return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
}

// Any FP16 tensor core instructions are available for ggml code.
static bool fp16_mma_available(const int cc) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
    return false;
#else
-     return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
-            GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
+            GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
-     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
-            GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+     return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA ||
+            GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
}

// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static bool new_mma_available(const int cc) {
-     return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
+     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}

static bool cp_async_available(const int cc) {
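A short sketch of why the GGML_CUDA_CC_IS_NVIDIA(cc) guard added throughout these helpers matters: every offset vendor value is numerically larger than any raw NVIDIA threshold, so an unguarded comparison such as `cc >= GGML_CUDA_CC_VOLTA` is vacuously true for AMD and Moore Threads devices. The defines below only mirror this diff; the asserts are illustrative:

```cpp
#include <cassert>

#define GGML_CUDA_CC_VOLTA           700
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)

int main() {
    const int cc = GGML_CUDA_CC_OFFSET_MTHREADS + 0x210; // a Moore Threads device (MTT S80)

    // Unguarded NVIDIA-style threshold: wrongly reports the device as Volta-class.
    assert(cc >= GGML_CUDA_CC_VOLTA);

    // With the vendor guard, the same feature check correctly rejects it.
    assert(!(GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA));
    return 0;
}
```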