From cc60e5f3362532698fe320f5c5c2cbc2b0ea84d6 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg Date: Tue, 8 Apr 2025 14:27:36 -0400 Subject: [PATCH 1/2] [MLIR][OpenMP] Enable multiple variables for target teams reductions This patch enables multiple reductions to be used in a reduction clause inside target regions for GPU offloading. --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 16 ++- .../LLVMIR/omptarget-multi-reduction.mlir | 127 ++++++++++++++++++ 2 files changed, 137 insertions(+), 6 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8d1cc9b10a950..454a5ec66d280 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -4688,12 +4688,18 @@ static uint64_t getTypeByteSize(mlir::Type type, const DataLayout &dl) { template <typename OpTy> static uint64_t getReductionDataSize(OpTy &op) { if (op.getNumReductionVars() > 0) { - assert(op.getNumReductionVars() == 1 && - "Only 1 reduction variable currently supported"); - mlir::Type reductionVarTy = op.getReductionVars()[0].getType(); + SmallVector<omp::DeclareReductionOp> reductions; + collectReductionDecls(op, reductions); + + llvm::SmallVector<mlir::Type> members; + for (omp::DeclareReductionOp &red : reductions) { + members.push_back(red.getType()); + } Operation *opp = op.getOperation(); + auto structType = mlir::LLVM::LLVMStructType::getLiteral( + opp->getContext(), members, /*isPacked=*/false); DataLayout dl = DataLayout(opp->getParentOfType<ModuleOp>()); - return getTypeByteSize(reductionVarTy, dl); + return getTypeByteSize(structType, dl); } return 0; } @@ -4789,8 +4795,6 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, (maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal)) combinedMaxThreadsVal = maxThreadsVal; - // Calculate reduction data size, limited 
to single reduction variable for - // now. int32_t reductionDataSize = 0; if (isGPU && capturedOp) { if (auto teamsOp = castOrGetParentOfType(capturedOp)) diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir new file mode 100644 index 0000000000000..1a4c081414c9e --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir @@ -0,0 +1,127 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Only check the overall shape of the code and the presence of relevant +// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level. + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } { + omp.private {type = private} @_QFEj_private_i32 : i32 + omp.declare_reduction @add_reduction_f32 : f32 init { + ^bb0(%arg0: f32): + %0 = llvm.mlir.constant(0.000000e+00 : f32) : f32 + omp.yield(%0 : f32) + } combiner { + ^bb0(%arg0: f32, %arg1: f32): + %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath} : f32 + omp.yield(%0 : f32) + } + omp.declare_reduction @add_reduction_f64 : f64 init { + ^bb0(%arg0: f64): + %0 = llvm.mlir.constant(0.000000e+00 : f64) : f64 + omp.yield(%0 : f64) + } combiner { + ^bb0(%arg0: f64, %arg1: f64): + %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath} : f64 + omp.yield(%0 : f64) + } + llvm.func @_QQmain() attributes {fir.bindc_name = "reduction", frame_pointer = #llvm.framePointerKind, omp.declare_target = #omp.declaretarget, target_cpu = "gfx1030", target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", 
"+dot2-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx10-3-insts", "+gfx10-insts", "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+s-memrealtime", "+s-memtime-inst", "+vmem-to-lds-load-insts", "+wavefrontsize32"]>} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "k"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> + %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr + %9 = llvm.mlir.constant(1 : i64) : i64 + %10 = llvm.alloca %9 x f32 {bindc_name = "ce4"} : (i64) -> !llvm.ptr<5> + %11 = llvm.addrspacecast %10 : !llvm.ptr<5> to !llvm.ptr + %12 = llvm.mlir.constant(1 : i64) : i64 + %13 = llvm.alloca %12 x f32 {bindc_name = "ce3"} : (i64) -> !llvm.ptr<5> + %14 = llvm.addrspacecast %13 : !llvm.ptr<5> to !llvm.ptr + %15 = llvm.mlir.constant(1 : i64) : i64 + %16 = llvm.alloca %15 x f64 {bindc_name = "ce2"} : (i64) -> !llvm.ptr<5> + %17 = llvm.addrspacecast %16 : !llvm.ptr<5> to !llvm.ptr + %18 = llvm.mlir.constant(1 : i64) : i64 + %19 = llvm.alloca %18 x f64 {bindc_name = "ce1"} : (i64) -> !llvm.ptr<5> + %20 = llvm.addrspacecast %19 : !llvm.ptr<5> to !llvm.ptr + %21 = llvm.mlir.constant(0.000000e+00 : f32) : f32 + %22 = llvm.mlir.constant(0.000000e+00 : f64) : f64 + %23 = llvm.mlir.constant(1 : i64) : i64 + %24 = llvm.mlir.constant(1 : i64) : i64 + %25 = llvm.mlir.constant(1 : i64) : i64 + %26 = llvm.mlir.constant(1 : i64) : i64 + %27 = llvm.mlir.constant(1 : i64) : i64 + %28 = llvm.mlir.constant(1 : i64) : i64 + %29 = llvm.mlir.constant(1 : i64) : i64 + llvm.store %22, %20 : f64, !llvm.ptr + llvm.store %22, %17 : f64, !llvm.ptr + llvm.store %21, %14 : f32, !llvm.ptr + llvm.store %21, %11 : f32, 
!llvm.ptr + %30 = omp.map.info var_ptr(%20 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce1"} + %31 = omp.map.info var_ptr(%17 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce2"} + %32 = omp.map.info var_ptr(%14 : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce3"} + %33 = omp.map.info var_ptr(%11 : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "ce4"} + %34 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "j"} + omp.target map_entries(%30 -> %arg0, %31 -> %arg1, %32 -> %arg2, %33 -> %arg3, %34 -> %arg4 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %35 = llvm.mlir.constant(1.000000e+00 : f32) : f32 + %36 = llvm.mlir.constant(1.000000e+00 : f64) : f64 + %37 = llvm.mlir.constant(1000 : i32) : i32 + %38 = llvm.mlir.constant(1 : i32) : i32 + omp.teams reduction(@add_reduction_f64 %arg0 -> %arg5, @add_reduction_f64 %arg1 -> %arg6, @add_reduction_f32 %arg2 -> %arg7, @add_reduction_f32 %arg3 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) { + omp.parallel { + omp.distribute { + omp.wsloop reduction(@add_reduction_f64 %arg5 -> %arg9, @add_reduction_f64 %arg6 -> %arg10, @add_reduction_f32 %arg7 -> %arg11, @add_reduction_f32 %arg8 -> %arg12 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) { + omp.simd private(@_QFEj_private_i32 %arg4 -> %arg13 : !llvm.ptr) reduction(@add_reduction_f64 %arg9 -> %arg14, @add_reduction_f64 %arg10 -> %arg15, @add_reduction_f32 %arg11 -> %arg16, @add_reduction_f32 %arg12 -> %arg17 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg18) : i32 = (%38) to (%37) inclusive step (%38) { + llvm.store %arg18, %arg13 : i32, !llvm.ptr + %39 = llvm.load %arg14 : !llvm.ptr -> f64 + %40 = llvm.fadd %39, %36 {fastmathFlags = #llvm.fastmath} : f64 + llvm.store %40, %arg14 : f64, !llvm.ptr + %41 = llvm.load %arg15 : !llvm.ptr -> f64 + 
%42 = llvm.fadd %41, %36 {fastmathFlags = #llvm.fastmath} : f64 + llvm.store %42, %arg15 : f64, !llvm.ptr + %43 = llvm.load %arg16 : !llvm.ptr -> f32 + %44 = llvm.fadd %43, %35 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %44, %arg16 : f32, !llvm.ptr + %45 = llvm.load %arg17 : !llvm.ptr -> f32 + %46 = llvm.fadd %45, %35 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %46, %arg17 : f32, !llvm.ptr + omp.yield + } + } {omp.composite} + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// CHECK: kernel_environment = +// CHECK-SAME: i32 24, i32 1024 +// CHECK: call void @[[OUTLINED:__omp_offloading_[A-Za-z0-9_.]*]] +// CHECK: %[[MASTER:.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2 +// CHECK: icmp eq i32 %[[MASTER]], 1 +// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]] +// CHECK: [[THEN]]: +// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]] +// CHECK-NEXT: store double %[[FINAL_RESULT0]] +// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]] +// CHECK-NEXT: store double %[[FINAL_RESULT1]] +// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]] +// CHECK-NEXT: store float %[[FINAL_RESULT2]] +// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS3]], %[[FINAL_RHS3]] +// 
CHECK-NEXT: store float %[[FINAL_RESULT3]] From 2568066078dc24c7aea42becdf3ad04440c470ed Mon Sep 17 00:00:00 2001 From: Jan Leyonberg Date: Wed, 9 Apr 2025 11:03:30 -0400 Subject: [PATCH 2/2] Update mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp Reserve space. Co-authored-by: Sergio Afonso --- .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 454a5ec66d280..f290be932b54e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -4692,9 +4692,9 @@ static uint64_t getReductionDataSize(OpTy &op) { collectReductionDecls(op, reductions); llvm::SmallVector<mlir::Type> members; - for (omp::DeclareReductionOp &red : reductions) { + members.reserve(reductions.size()); + for (omp::DeclareReductionOp &red : reductions) members.push_back(red.getType()); - } Operation *opp = op.getOperation(); auto structType = mlir::LLVM::LLVMStructType::getLiteral( opp->getContext(), members, /*isPacked=*/false);