diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td index 239a5f1e2bc29..a0c0d4cfd8714 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td @@ -671,7 +671,7 @@ def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegment tile shape. The descriptor is created by `nvgpu.tma.create.descriptor` }]; let arguments = (ins Arg]>:$src, - NVGPU_TensorMapDescriptor:$tensorMapDescriptor, + Arg]>:$tensorMapDescriptor, Variadic:$coordinates, Optional:$predicate); let assemblyFormat = [{ diff --git a/mlir/test/Dialect/NVGPU/canonicalization.mlir b/mlir/test/Dialect/NVGPU/canonicalization.mlir new file mode 100644 index 0000000000000..a7fbfd8067395 --- /dev/null +++ b/mlir/test/Dialect/NVGPU/canonicalization.mlir @@ -0,0 +1,30 @@ +// RUN: mlir-opt %s | mlir-opt -canonicalize -cse | FileCheck %s + +gpu.module @main_kernel { + +// CHECK-LABEL: @main_kernel( +// CHECK-SAME: %[[arg0:.*]]: !nvgpu.tensormap.descriptor + gpu.func @main_kernel(%arg0: !nvgpu.tensormap.descriptor< + tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, + oob = zero, interleave = none>) kernel attributes + { gpu.known_block_size = array, + gpu.known_grid_size = array + } + { + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[S0:.+]] = gpu.thread_id x + // CHECK: %[[S1:.+]] = arith.cmpi eq, %[[S0]], %[[c0]] : index + // CHECK: %[[S2:.+]] = gpu.dynamic_shared_memory : memref> + // CHECK: %[[S3:.+]] = memref.view %[[S2]][%[[c0]]][] : memref> to memref<128x32xf32, #gpu.address_space> + // CHECK: nvgpu.tma.async.store %[[S3]] to %[[arg0]][%[[c0]], %[[c0]]], predicate = %[[S1]] : memref<128x32xf32, #gpu.address_space> -> , swizzle = none, l2promo = none, oob = zero, interleave = none> + %c0 = arith.constant 0 : index + %0 = gpu.thread_id x + %1 = arith.cmpi eq, %0, %c0 : index + %2 = gpu.dynamic_shared_memory : memref> + %view = memref.view %2[%c0][] : memref> to memref<128x32xf32, #gpu.address_space> + nvgpu.tma.async.store %view to %arg0[%c0, %c0], predicate = %1 : memref<128x32xf32, #gpu.address_space> -> , swizzle = none, l2promo = none, oob = zero, interleave = none> + nvvm.cp.async.bulk.commit.group + nvvm.cp.async.bulk.wait_group 0 + gpu.return + } +} \ No newline at end of file