[MLIR][NVVM] Extend TMA Bulk Copy Op #140232

Open · durga4github wants to merge 1 commit into main from durgadossr/mlir_tma_copy_bytemask
Conversation

@durga4github (Contributor) commented May 16, 2025

This patch extends the non-tensor TMA Bulk Copy Op (from shared_cta to global) with an optional byte-mask operand. Bit i of the 16-bit mask selects whether byte i of each 16-byte chunk of source data is copied to the destination.

  • lit tests are added to verify the lowering to the intrinsics.
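
For illustration, here is a minimal sketch of the extended op in use (mirroring the lit test added in this PR; the function name and SSA value names are placeholders):

```mlir
llvm.func @tma_bulk_copy_with_mask(%dst : !llvm.ptr<1>, %src : !llvm.ptr<3>,
                                   %size : i32, %ch : i64, %mask : i16) {
  // Mask-only form: lowers to the *.bytemask intrinsic, with the
  // cache-hint operand defaulted to 0 and its flag set to false.
  nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size
      byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>

  // Mask combined with an L2 cache hint.
  nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size
      l2_cache_hint = %ch byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>
  llvm.return
}
```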

@llvmbot (Member) commented May 16, 2025

@llvm/pr-subscribers-mlir-llvm

@llvm/pr-subscribers-mlir

Author: Durgadoss R (durga4github)

Changes


Full diff: https://github.com/llvm/llvm-project/pull/140232.diff

3 Files Affected:

  • (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+23-27)
  • (modified) mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (+25)
  • (modified) mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir (+11-1)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index a8e7dcb54ac20..b46faaf4e8668 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2599,15 +2599,18 @@ def NVVM_CpAsyncBulkSharedCTAToSharedClusterOp :
 }
 
 def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
-  NVVM_Op<"cp.async.bulk.global.shared.cta"> {
+  NVVM_Op<"cp.async.bulk.global.shared.cta", [AttrSizedOperandSegments]> {
   let summary = "Async bulk copy from Shared CTA memory to Global memory";
   let description = [{
     Initiates an asynchronous copy operation from Shared CTA memory to
-    global memory.
+    global memory. The 32-bit operand `size` specifies the amount of
+    memory to be copied, in terms of number of bytes. `size` must be a
+    multiple of 16. The `l2CacheHint` operand is optional, and it is used
+    to specify cache eviction policy that may be used during the memory
+    access. The i-th bit in the 16-bit wide `byteMask` operand specifies
+    whether the i-th byte of each 16-byte wide chunk of source data is
+    copied to the destination. If the bit is set, the byte is copied.
 
-    The `l2CacheHint` operand is optional, and it is used to specify cache
-    eviction policy that may be used during the memory access.
-    
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
   }];
 
@@ -2615,35 +2618,28 @@ def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
     LLVM_PointerGlobal:$dstMem,
     LLVM_PointerShared:$srcMem,
     I32:$size,
-    Optional<I64>:$l2CacheHint);
+    Optional<I64>:$l2CacheHint,
+    Optional<I16>:$byteMask);
 
   let assemblyFormat = [{
     $dstMem `,` $srcMem `,` $size
     (`l2_cache_hint` `=` $l2CacheHint^ )?
-    attr-dict  `:` type($dstMem) `,` type($srcMem)
+    (`byte_mask` `=` $byteMask^ )?
+    attr-dict `:` type($dstMem) `,` type($srcMem)
   }];
 
+  let extraClassDeclaration = [{
+    static llvm::Intrinsic::ID
+      getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                            llvm::SmallVector<llvm::Value *> &args,
+                            llvm::IRBuilderBase& builder);
+  }];
   string llvmBuilder = [{
-    // Arguments to the intrinsic:
-    // dst, src, size, cache_hint,
-    // Flag for cache_hint
-    //
-    llvm::SmallVector<llvm::Value *> translatedOperands;
-    translatedOperands.push_back($dstMem);
-    translatedOperands.push_back($srcMem);
-    translatedOperands.push_back($size);
-
-    // Cachehint, if available
-    llvm::LLVMContext &ctx = moduleTranslation.getLLVMContext();
-    auto *i64Unused = llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0);
-    bool isCacheHint = op.getL2CacheHint() ? true : false;
-    translatedOperands.push_back(isCacheHint ? $l2CacheHint : i64Unused);
-
-    // Flag argument for cachehint
-    translatedOperands.push_back(builder.getInt1(isCacheHint));
-
-    createIntrinsicCall(builder,
-      llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global, translatedOperands);
+    llvm::SmallVector<llvm::Value *> args;
+    llvm::Intrinsic::ID id =
+        NVVM::CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
+            *op, moduleTranslation, args, builder);
+    createIntrinsicCall(builder, id, args);
   }];
 }
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 1ea3f96fa75f5..50fab5a1b3d87 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1253,6 +1253,31 @@ CpAsyncOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
   return id;
 }
 
+llvm::Intrinsic::ID CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt,
+    llvm::SmallVector<llvm::Value *> &args, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::CpAsyncBulkSharedCTAToGlobalOp>(op);
+
+  // Fill the Intrinsic Args
+  args.push_back(mt.lookupValue(thisOp.getDstMem()));
+  args.push_back(mt.lookupValue(thisOp.getSrcMem()));
+  args.push_back(mt.lookupValue(thisOp.getSize()));
+
+  auto cacheHint = thisOp.getL2CacheHint();
+  const bool hasCacheHint = static_cast<bool>(cacheHint);
+  auto *i64Unused =
+      llvm::ConstantInt::get(llvm::Type::getInt64Ty(mt.getLLVMContext()), 0);
+  args.push_back(hasCacheHint ? mt.lookupValue(cacheHint) : i64Unused);
+  args.push_back(builder.getInt1(hasCacheHint));
+
+  if (auto byteMask = thisOp.getByteMask()) {
+    args.push_back(mt.lookupValue(byteMask));
+    return llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global_bytemask;
+  }
+
+  return llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global;
+}
+
 llvm::Intrinsic::ID CpAsyncBulkTensorPrefetchOp::getIntrinsicID(int tensorDims,
                                                                 bool isIm2Col) {
   switch (tensorDims) {
diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir
index 39b703d9a9677..0daf24536a672 100644
--- a/mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir
@@ -26,9 +26,19 @@ llvm.func @llvm_nvvm_cp_async_bulk_shared_cta_to_shared_cluster(%dst : !llvm.ptr
 // CHECK-LABEL: @llvm_nvvm_cp_async_bulk_shared_cta_to_global
 llvm.func @llvm_nvvm_cp_async_bulk_shared_cta_to_global(%dst : !llvm.ptr<1>, %src : !llvm.ptr<3>, %size : i32, %ch : i64) {
   // CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %[[DST:.*]], ptr addrspace(3) %[[SRC:.*]], i32 %[[SIZE:.*]], i64 0, i1 false)
-  // CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %[[DST:.*]], ptr addrspace(3) %[[SRC:.*]], i32 %[[SIZE:.*]], i64 %[[CH:.*]], i1 true)
+  // CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %[[DST]], ptr addrspace(3) %[[SRC]], i32 %[[SIZE]], i64 %[[CH:.*]], i1 true)
   nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size : !llvm.ptr<1>, !llvm.ptr<3>
 
   nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch : !llvm.ptr<1>, !llvm.ptr<3>
   llvm.return
 }
+
+// CHECK-LABEL: @llvm_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
+llvm.func @llvm_nvvm_cp_async_bulk_shared_cta_to_global_bytemask(%dst : !llvm.ptr<1>, %src : !llvm.ptr<3>, %size : i32, %ch : i64, %mask : i16) {
+  // CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %[[DST:.*]], ptr addrspace(3) %[[SRC:.*]], i32 %[[SIZE:.*]], i64 0, i1 false, i16 %[[MASK:.*]])
+  // CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %[[DST]], ptr addrspace(3) %[[SRC]], i32 %[[SIZE]], i64 %[[CH:.*]], i1 true, i16 %[[MASK]])
+  nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>
+
+  nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>
+  llvm.return
+}
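
To make the `byteMask` semantics above concrete, a hypothetical example (the constant mask value is chosen purely for illustration): with a mask of 0x00FF, bits 0 through 7 are set, so only bytes 0 through 7 of every 16-byte chunk of source data are written to the destination.

```mlir
// Hypothetical sketch: bit i of the i16 mask gates byte i of each
// 16-byte chunk. 0x00FF (bits 0-7 set) copies the low 8 bytes of
// every chunk and skips the high 8 bytes.
llvm.func @copy_low_half_of_chunks(%dst : !llvm.ptr<1>, %src : !llvm.ptr<3>,
                                   %size : i32) {
  %mask = llvm.mlir.constant(255 : i16) : i16  // 0x00FF
  nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size
      byte_mask = %mask : !llvm.ptr<1>, !llvm.ptr<3>
  llvm.return
}
```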

@durga4github durga4github force-pushed the durgadossr/mlir_tma_copy_bytemask branch from a227f45 to 9a83831 Compare May 16, 2025 10:13
Review thread on mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (outdated, resolved), on the `extraClassDeclaration` hunk:

Member: add doc-string

durga4github (Contributor, Author): Done, added in the latest revision.
@durga4github durga4github force-pushed the durgadossr/mlir_tma_copy_bytemask branch from 9a83831 to 3d0efbd Compare May 16, 2025 12:15

github-actions bot commented May 16, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@durga4github durga4github force-pushed the durgadossr/mlir_tma_copy_bytemask branch 2 times, most recently from 48bf8e4 to 8d1ad70 Compare May 16, 2025 12:42
Two review threads on mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (outdated, resolved).
Commit message:

This patch extends the non-tensor TMA Bulk Copy Op (from shared_cta to global) with an optional byte mask operand. This mask enables selectively copying individual bytes to the destination.

* lit tests are added to verify the lowering to
  the intrinsics.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
@durga4github durga4github force-pushed the durgadossr/mlir_tma_copy_bytemask branch from 8d1ad70 to a7144d6 Compare May 17, 2025 13:10