
[MLIR][OpenMP] Enable multiple variables for target teams reductions #134903


Merged: 2 commits merged into llvm:main on Apr 9, 2025

Conversation

jsjodin (Contributor) commented Apr 8, 2025

This patch enables multiple reductions to be used in a reduction clause inside target regions for GPU offloading.

This patch enables multiple reductions to be used in a reduction clause inside target regions for GPU offloading.
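
For illustration, the user-level pattern this covers is a single reduction clause listing several variables of mixed types on a target teams construct. Below is a hedged C++/OpenMP sketch of such a program; the MLIR test in this patch appears to be lowered from Fortran (note the fir.bindc_name attributes), so the variable names here simply mirror that test and the code itself is hypothetical, not part of the patch.

// Hypothetical C++/OpenMP example of the construct this patch enables when
// offloading to a GPU: one reduction clause with four variables, two double
// and two float, on a combined target teams loop construct.
#include <cstdio>

int main() {
  double ce1 = 0.0, ce2 = 0.0;
  float ce3 = 0.0f, ce4 = 0.0f;

#pragma omp target teams distribute parallel for simd \
    reduction(+ : ce1, ce2, ce3, ce4)
  for (int j = 1; j <= 1000; ++j) {
    ce1 += 1.0;
    ce2 += 1.0;
    ce3 += 1.0f;
    ce4 += 1.0f;
  }

  std::printf("%f %f %f %f\n", ce1, ce2, ce3, ce4);
  return 0;
}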
llvmbot (Member) commented Apr 8, 2025

@llvm/pr-subscribers-mlir-openmp
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-flang-openmp

Author: Jan Leyonberg (jsjodin)

Changes

This patch enables multiple reductions to be used in a reduction clause inside target regions for GPU offloading.


Full diff: https://github.com/llvm/llvm-project/pull/134903.diff

2 Files Affected:

  • (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+10-6)
  • (added) mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir (+127)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8d1cc9b10a950..454a5ec66d280 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -4688,12 +4688,18 @@ static uint64_t getTypeByteSize(mlir::Type type, const DataLayout &dl) {
 template <typename OpTy>
 static uint64_t getReductionDataSize(OpTy &op) {
   if (op.getNumReductionVars() > 0) {
-    assert(op.getNumReductionVars() == 1 &&
-           "Only 1 reduction variable currently supported");
-    mlir::Type reductionVarTy = op.getReductionVars()[0].getType();
+    SmallVector<omp::DeclareReductionOp> reductions;
+    collectReductionDecls(op, reductions);
+
+    llvm::SmallVector<mlir::Type> members;
+    for (omp::DeclareReductionOp &red : reductions) {
+      members.push_back(red.getType());
+    }
     Operation *opp = op.getOperation();
+    auto structType = mlir::LLVM::LLVMStructType::getLiteral(
+        opp->getContext(), members, /*isPacked=*/false);
     DataLayout dl = DataLayout(opp->getParentOfType<ModuleOp>());
-    return getTypeByteSize(reductionVarTy, dl);
+    return getTypeByteSize(structType, dl);
   }
   return 0;
 }
@@ -4789,8 +4795,6 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp,
       (maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal))
     combinedMaxThreadsVal = maxThreadsVal;
 
-  // Calculate reduction data size, limited to single reduction variable for
-  // now.
   int32_t reductionDataSize = 0;
   if (isGPU && capturedOp) {
     if (auto teamsOp = castOrGetParentOfType<omp::TeamsOp>(capturedOp))
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
new file mode 100644
index 0000000000000..1a4c081414c9e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
@@ -0,0 +1,127 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Only check the overall shape of the code and the presence of relevant
+// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
+  omp.private {type = private} @_QFEj_private_i32 : i32
+  omp.declare_reduction @add_reduction_f32 : f32 init {
+  ^bb0(%arg0: f32):
+    %0 = llvm.mlir.constant(0.000000e+00 : f32) : f32
+    omp.yield(%0 : f32)
+  } combiner {
+  ^bb0(%arg0: f32, %arg1: f32):
+    %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath<contract>} : f32
+    omp.yield(%0 : f32)
+  }
+  omp.declare_reduction @add_reduction_f64 : f64 init {
+  ^bb0(%arg0: f64):
+    %0 = llvm.mlir.constant(0.000000e+00 : f64) : f64
+    omp.yield(%0 : f64)
+  } combiner {
+  ^bb0(%arg0: f64, %arg1: f64):
+    %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath<contract>} : f64
+    omp.yield(%0 : f64)
+  }
+  llvm.func @_QQmain() attributes {fir.bindc_name = "reduction", frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "gfx1030", target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx10-3-insts", "+gfx10-insts", "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+s-memrealtime", "+s-memtime-inst", "+vmem-to-lds-load-insts", "+wavefrontsize32"]>} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "k"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5>
+    %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr
+    %9 = llvm.mlir.constant(1 : i64) : i64
+    %10 = llvm.alloca %9 x f32 {bindc_name = "ce4"} : (i64) -> !llvm.ptr<5>
+    %11 = llvm.addrspacecast %10 : !llvm.ptr<5> to !llvm.ptr
+    %12 = llvm.mlir.constant(1 : i64) : i64
+    %13 = llvm.alloca %12 x f32 {bindc_name = "ce3"} : (i64) -> !llvm.ptr<5>
+    %14 = llvm.addrspacecast %13 : !llvm.ptr<5> to !llvm.ptr
+    %15 = llvm.mlir.constant(1 : i64) : i64
+    %16 = llvm.alloca %15 x f64 {bindc_name = "ce2"} : (i64) -> !llvm.ptr<5>
+    %17 = llvm.addrspacecast %16 : !llvm.ptr<5> to !llvm.ptr
+    %18 = llvm.mlir.constant(1 : i64) : i64
+    %19 = llvm.alloca %18 x f64 {bindc_name = "ce1"} : (i64) -> !llvm.ptr<5>
+    %20 = llvm.addrspacecast %19 : !llvm.ptr<5> to !llvm.ptr
+    %21 = llvm.mlir.constant(0.000000e+00 : f32) : f32
+    %22 = llvm.mlir.constant(0.000000e+00 : f64) : f64
+    %23 = llvm.mlir.constant(1 : i64) : i64
+    %24 = llvm.mlir.constant(1 : i64) : i64
+    %25 = llvm.mlir.constant(1 : i64) : i64
+    %26 = llvm.mlir.constant(1 : i64) : i64
+    %27 = llvm.mlir.constant(1 : i64) : i64
+    %28 = llvm.mlir.constant(1 : i64) : i64
+    %29 = llvm.mlir.constant(1 : i64) : i64
+    llvm.store %22, %20 : f64, !llvm.ptr
+    llvm.store %22, %17 : f64, !llvm.ptr
+    llvm.store %21, %14 : f32, !llvm.ptr
+    llvm.store %21, %11 : f32, !llvm.ptr
+    %30 = omp.map.info var_ptr(%20 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce1"}
+    %31 = omp.map.info var_ptr(%17 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce2"}
+    %32 = omp.map.info var_ptr(%14 : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce3"}
+    %33 = omp.map.info var_ptr(%11 : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce4"}
+    %34 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "j"}
+    omp.target map_entries(%30 -> %arg0, %31 -> %arg1, %32 -> %arg2, %33 -> %arg3, %34 -> %arg4 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %35 = llvm.mlir.constant(1.000000e+00 : f32) : f32
+      %36 = llvm.mlir.constant(1.000000e+00 : f64) : f64
+      %37 = llvm.mlir.constant(1000 : i32) : i32
+      %38 = llvm.mlir.constant(1 : i32) : i32
+      omp.teams reduction(@add_reduction_f64 %arg0 -> %arg5, @add_reduction_f64 %arg1 -> %arg6, @add_reduction_f32 %arg2 -> %arg7, @add_reduction_f32 %arg3 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+        omp.parallel {
+          omp.distribute {
+            omp.wsloop reduction(@add_reduction_f64 %arg5 -> %arg9, @add_reduction_f64 %arg6 -> %arg10, @add_reduction_f32 %arg7 -> %arg11, @add_reduction_f32 %arg8 -> %arg12 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+              omp.simd private(@_QFEj_private_i32 %arg4 -> %arg13 : !llvm.ptr) reduction(@add_reduction_f64 %arg9 -> %arg14, @add_reduction_f64 %arg10 -> %arg15, @add_reduction_f32 %arg11 -> %arg16, @add_reduction_f32 %arg12 -> %arg17 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+                omp.loop_nest (%arg18) : i32 = (%38) to (%37) inclusive step (%38) {
+                  llvm.store %arg18, %arg13 : i32, !llvm.ptr
+                  %39 = llvm.load %arg14 : !llvm.ptr -> f64
+                  %40 = llvm.fadd %39, %36 {fastmathFlags = #llvm.fastmath<contract>} : f64
+                  llvm.store %40, %arg14 : f64, !llvm.ptr
+                  %41 = llvm.load %arg15 : !llvm.ptr -> f64
+                  %42 = llvm.fadd %41, %36 {fastmathFlags = #llvm.fastmath<contract>} : f64
+                  llvm.store %42, %arg15 : f64, !llvm.ptr
+                  %43 = llvm.load %arg16 : !llvm.ptr -> f32
+                  %44 = llvm.fadd %43, %35 {fastmathFlags = #llvm.fastmath<contract>} : f32
+                  llvm.store %44, %arg16 : f32, !llvm.ptr
+                  %45 = llvm.load %arg17 : !llvm.ptr -> f32
+                  %46 = llvm.fadd %45, %35 {fastmathFlags = #llvm.fastmath<contract>} : f32
+                  llvm.store %46, %arg17 : f32, !llvm.ptr
+                  omp.yield
+                }
+              } {omp.composite}
+            } {omp.composite}
+          } {omp.composite}
+          omp.terminator
+        } {omp.composite}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: kernel_environment =
+// CHECK-SAME: i32 24, i32 1024
+// CHECK: call void @[[OUTLINED:__omp_offloading_[A-Za-z0-9_.]*]]
+// CHECK: %[[MASTER:.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2
+// CHECK: icmp eq i32 %[[MASTER]], 1
+// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
+// CHECK: [[THEN]]:
+// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]]
+// CHECK-NEXT: store double %[[FINAL_RESULT0]]
+// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]]
+// CHECK-NEXT: store double %[[FINAL_RESULT1]]
+// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]]
+// CHECK-NEXT: store float %[[FINAL_RESULT2]]
+// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS3]], %[[FINAL_RHS3]]
+// CHECK-NEXT: store float %[[FINAL_RESULT3]]
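
For reference, getReductionDataSize reads roughly as follows after this patch, reconstructed from the hunk above; collectReductionDecls and getTypeByteSize are existing helpers in the same file, and this is a sketch of the post-patch state rather than a verbatim copy beyond what the diff shows.

template <typename OpTy>
static uint64_t getReductionDataSize(OpTy &op) {
  if (op.getNumReductionVars() > 0) {
    // Collect the omp.declare_reduction declarations referenced by the
    // reduction clause of this op.
    SmallVector<omp::DeclareReductionOp> reductions;
    collectReductionDecls(op, reductions);

    // Build a literal, non-packed LLVM struct whose members are the
    // reduction element types, so the module data layout gives the
    // combined byte size of all reduction variables.
    llvm::SmallVector<mlir::Type> members;
    for (omp::DeclareReductionOp &red : reductions) {
      members.push_back(red.getType());
    }

    Operation *opp = op.getOperation();
    auto structType = mlir::LLVM::LLVMStructType::getLiteral(
        opp->getContext(), members, /*isPacked=*/false);
    DataLayout dl = DataLayout(opp->getParentOfType<ModuleOp>());
    return getTypeByteSize(structType, dl);
  }
  return 0;
}

In the test, the four reduction variables have types f64, f64, f32, and f32, so the literal struct is {f64, f64, f32, f32} and its byte size is 8 + 8 + 4 + 4 = 24, which matches the i32 24 checked on the kernel_environment line.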

llvmbot (Member) commented Apr 8, 2025

@llvm/pr-subscribers-mlir-llvm

skatrak (Member) left a comment


Thank you Jan, LGTM

…n.cpp

Reserve space.

Co-authored-by: Sergio Afonso <safonsof@amd.com>
jsjodin merged commit 1aed6ad into llvm:main on Apr 9, 2025
11 checks passed
AllinLeeYL pushed a commit to AllinLeeYL/llvm-project that referenced this pull request Apr 10, 2025
…lvm#134903)

This patch enables multiple reductions to be used in a reduction clause
inside target regions for GPU offloading.

---------

Co-authored-by: Sergio Afonso <safonsof@amd.com>
var-const pushed a commit to ldionne/llvm-project that referenced this pull request Apr 17, 2025
…lvm#134903)

This patch enables multiple reductions to be used in a reduction clause
inside target regions for GPU offloading.

---------

Co-authored-by: Sergio Afonso <safonsof@amd.com>