pytorch
diff --git a/aten/src/ATen/native/cuda/RowwiseScaledMM.cu b/aten/src/ATen/native/cuda/RowwiseScaledMM.cu
Lines changed: 4 additions & 38 deletions
diff --git a/third_party/cutlass b/third_party/cutlass
@@ -5,58 +5,24 @@
 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
 
 // Determine if the architecture supports rowwise scaled mm
-// Currenlty failing on windows with: https://github.com/NVIDIA/cutlass/issues/1571
+// Currently failing on windows with:
+// https://github.com/NVIDIA/cutlass/issues/1571
 #if !defined(USE_ROCM) && !defined(_WIN32) && defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 
 #define BUILD_ROWWISE_FP8_KERNEL
 #endif
 
 #if defined(BUILD_ROWWISE_FP8_KERNEL)
 
-// We are going to override the cuTensorMapEncodeTiled driver api with our lazy loader
-static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
-    CUtensorMap* tensorMap,
-    CUtensorMapDataType tensorDataType,
-    cuuint32_t tensorRank,
-    void* globalAddress,
-    const cuuint64_t* globalDim,
-    const cuuint64_t* globalStrides,
-    const cuuint32_t* boxDim,
-    const cuuint32_t* elementStrides,
-    CUtensorMapInterleave interleave,
-    CUtensorMapSwizzle swizzle,
-    CUtensorMapL2promotion l2Promotion,
-    CUtensorMapFloatOOBfill oobFill) {
-  return at::globalContext().getNVRTC().cuTensorMapEncodeTiled(
-      tensorMap,
-      tensorDataType,
-      tensorRank,
-      globalAddress,
-      globalDim,
-      globalStrides,
-      boxDim,
-      elementStrides,
-      interleave,
-      swizzle,
-      l2Promotion,
-      oobFill);
-}
-
-
-#include <cutlass/version.h>
+#include <cute/tensor.hpp>
 #include <cutlass/core_io.h>
 #include <cutlass/cutlass.h>
 #include <cutlass/gemm/device/gemm.h>
 #include <cutlass/half.h>
 #include <cutlass/numeric_types.h>
 #include <cutlass/trace.h>
 #include <cutlass/util/host_tensor.h>
-
-// Rename the global function symbol
-#define cuTensorMapEncodeTiled nvrtc_cuTensorMapEncodeTiled
-#include <cute/tensor.hpp>
-#undef cuTensorMapEncodeTiled
-// Set everything back to normal
+#include <cutlass/version.h>
 
 #include <cutlass/gemm/collective/collective_builder.hpp>
 #include <cutlass/gemm/device/gemm_universal_adapter.h>