[OpenMP 6.0] Codegen for Reduction over private variables with reduction clause #134709
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang Author: CHANDRA GHALE (chandraghale)
Changes: Codegen support for reduction over a private variable with the reduction clause. Section 7.6.10 in the OpenMP 6.0 spec.
Patch is 24.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134709.diff 4 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 06a652c146fb9..3424227e5da79 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4899,6 +4899,150 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
}
}
+void CGOpenMPRuntime::emitPrivateReduction(
+ CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps) {
+
+ if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
+ return;
+
+ if (LHSExprs.size() != Privates.size() ||
+ LHSExprs.size() != ReductionOps.size())
+ return;
+
+ QualType PrivateType = Privates[0]->getType();
+ llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
+
+ BinaryOperatorKind MainBO = BO_Comma;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[0])) {
+ if (const auto *RHSExpr = BinOp->getRHS()) {
+ if (const auto *BORHS =
+ dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ MainBO = BORHS->getOpcode();
+ }
+ }
+ }
+
+ llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType);
+ const Expr *Private = Privates[0];
+
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Private)) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ if (const Expr *Init = VD->getInit()) {
+ if (Init->isConstantInitializer(CGF.getContext(), false)) {
+ Expr::EvalResult Result;
+ if (Init->EvaluateAsRValue(Result, CGF.getContext())) {
+ APValue &InitValue = Result.Val;
+ if (InitValue.isInt()) {
+ InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Create an internal shared variable
+ std::string SharedName = getName({"internal_private_var"});
+ llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage,
+ InitVal, ".omp.reduction." + SharedName, nullptr,
+ llvm::GlobalVariable::NotThreadLocal);
+
+ SharedVar->setAlignment(
+ llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
+
+ Address SharedResult(SharedVar, SharedVar->getValueType(),
+ CGF.getContext().getTypeAlignInChars(PrivateType));
+
+ llvm::Value *ThreadId = getThreadID(CGF, Loc);
+ llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
+ llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
+ llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
+
+ llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ(
+ ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
+ CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
+
+ CGF.EmitBlock(InitBB);
+ CGF.Builder.CreateStore(InitVal, SharedResult);
+ CGF.Builder.CreateBr(InitEndBB);
+
+ CGF.EmitBlock(InitEndBB);
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ for (unsigned I = 0; I < ReductionOps.size(); ++I) {
+ if (I >= LHSExprs.size()) {
+ break;
+ }
+
+ const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]);
+ if (!BinOp || BinOp->getOpcode() != BO_Assign)
+ continue;
+
+ const Expr *RHSExpr = BinOp->getRHS();
+ if (!RHSExpr)
+ continue;
+
+ BinaryOperatorKind BO = BO_Comma;
+ if (const auto *BORHS =
+ dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ BO = BORHS->getOpcode();
+ }
+
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
+ auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) {
+ if (BO == BO_Mul) {
+ llvm::Value *OldScalar = OldVal.getScalarVal();
+ llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
+ llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar);
+ return RValue::get(Result);
+ } else {
+ OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
+ BinOp->getLHS()->getType(),
+ ExprValueKind::VK_PRValue);
+ CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal);
+ return CGF.EmitAnyExpr(BinOp->getRHS());
+ }
+ };
+
+ (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
+ SharedLV, PrivateRV, BO, true,
+ llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ }
+
+ // Final barrier
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ // Broadcast final result
+ llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult);
+
+ // Update private variables with final result
+ for (unsigned I = 0; I < Privates.size(); ++I) {
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress());
+ }
+
+ // Final synchronization
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+}
+
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
@@ -5201,6 +5345,9 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+ if (Options.IsPrivateVarReduction) {
+ emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
+ }
}
/// Generates unique name for artificial threadprivate variables.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 4321712e1521d..50ba28b565b6d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1201,8 +1201,22 @@ class CGOpenMPRuntime {
struct ReductionOptionsTy {
bool WithNowait;
bool SimpleReduction;
+ bool IsPrivateVarReduction;
OpenMPDirectiveKind ReductionKind;
};
+
+ /// Emits code for private variable reduction
+ /// \param Privates List of private copies for original reduction arguments.
+ /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+ /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+ /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+ /// or 'operator binop(LHS, RHS)'.
+ void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
+ ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs,
+ ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps);
+
/// Emit a code for reduction clause. Next code should be emitted for
/// reduction:
/// \code
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index e4d1db264aac9..720a88e075ddd 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1470,6 +1470,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
llvm::SmallVector<const Expr *, 8> LHSExprs;
llvm::SmallVector<const Expr *, 8> RHSExprs;
llvm::SmallVector<const Expr *, 8> ReductionOps;
+ llvm::SmallVector<bool, 8> IsPrivate;
bool HasAtLeastOneReduction = false;
bool IsReductionWithTaskMod = false;
for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
@@ -1480,6 +1481,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
Privates.append(C->privates().begin(), C->privates().end());
LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+ IsPrivate.append(C->private_var_reduction_flags().begin(),
+ C->private_var_reduction_flags().end());
ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
IsReductionWithTaskMod =
IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
@@ -1499,9 +1502,11 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
bool SimpleReduction = ReductionKind == OMPD_simd;
// Emit nowait reduction if nowait clause is present or directive is a
// parallel directive (it always has implicit barrier).
+ bool IsPrivateVarReduction =
+ llvm::any_of(IsPrivate, [](bool IsPriv) { return IsPriv; });
CGM.getOpenMPRuntime().emitReduction(
*this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps,
- {WithNowait, SimpleReduction, ReductionKind});
+ {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind});
}
}
@@ -3943,7 +3948,8 @@ static void emitScanBasedDirective(
PrivScope.Privatize();
CGF.CGM.getOpenMPRuntime().emitReduction(
CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true,
+ /*IsPrivateVarReduction */ false, OMPD_unknown});
}
llvm::Value *NextIVal =
CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
@@ -5747,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
}
CGM.getOpenMPRuntime().emitReduction(
*this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true, false, OMPD_simd});
for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
const Expr *PrivateExpr = Privates[I];
LValue DestLVal;
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
new file mode 100644
index 0000000000000..be50991ca193e
--- /dev/null
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -0,0 +1,236 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#define N 10
+void do_red(int n, int *v, int &sum_v)
+ {
+ sum_v = 0;
+ #pragma omp for reduction(original(private),+: sum_v)
+ for (int i = 0; i < n; i++)
+ {
+ sum_v += v[i];
+ }
+ }
+ int main(void)
+ {
+ int v[N];
+ for (int i = 0; i < N; i++)
+ v[i] = i;
+ #pragma omp parallel num_threads(4)
+ {
+ int s_v;
+ do_red(N, v, s_v);
+ }
+ return 0;
+ }
+//.
+// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4
+// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+//.
+// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi(
+// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]]
+// CHECK: [[OMP_PRECOND_THEN]]:
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[T...
[truncated]
if (Options.IsPrivateVarReduction) {
  emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
}
Drop braces
Done !!
Refactored the code to look up the expression and handle UDR, emitting it as is. Updated the lit test.
@alexey-bataev Reworked the UDR handling. It would be helpful if you could review further.
Need to update OpenMPSupport.rst and release notes
@@ -4899,6 +4899,238 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
  }
}

void CGOpenMPRuntime::emitPrivateReduction(
- Add comments describing the logic of the function.
- How does it differ from the regular reductions codegen? Can you try to unify the logic to reduce the maintenance cost?
- Updated the code with comments and clarified the logic.
- Updated OpenMPSupport.rst and the release notes.
- Unified the logic as much as possible.
- In regular reduction codegen, threads update a shared variable directly. With private reduction, each thread first works with its own private copy, and then all the partial results are combined into a shared variable. The final result is then copied back to each thread's original variable; see the sketch below.
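A minimal hand-written sketch of what that finalization amounts to for a `+` reduction over an `int`. This is illustrative pseudo-source, not the generated code; `internal_shared` and `combine_private` are placeholder names standing in for the `.omp.reduction..internal_private_var` global and the sequence emitted by `emitPrivateReduction`:

```c
#include <omp.h>

static int internal_shared; /* stands in for .omp.reduction..internal_private_var */

/* Must be called from inside the parallel region by every thread. */
void combine_private(int *original_private, int thread_partial) {
  #pragma omp barrier
  if (omp_get_thread_num() == 0)   /* thread 0 initializes the shared temporary */
    internal_shared = 0;
  #pragma omp barrier
  #pragma omp atomic
  internal_shared += thread_partial;   /* every thread folds in its partial sum */
  #pragma omp barrier
  *original_private = internal_shared; /* broadcast the combined value back to
                                          each thread's (private) original */
  #pragma omp barrier
}
```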
void do_red(int n, int *v, int &sum_v)
{
  sum_v = 0;
  #pragma omp for reduction(original(private),+: sum_v)
Add a test with 2 or 3 reduced values in the same construct
Updated !!
openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp (resolved)
float_sum_val_out = 0.0f;

const float kPiValue = 3.14f;
#pragma omp for reduction(original(private), + : sum_val_out) \
Same, use 2 variables
Isn't this already reducing over two more variables, *: prod_val_out and +: float_sum_val_out?
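For reference, a hedged reconstruction of the directive being discussed; the exact layout in omp_for_private_reduction.cpp may differ, the loop body is purely illustrative, and the function is assumed to be called from inside a parallel region with each thread's private variables:

```cpp
void do_multi_red(int n, int &sum_val_out, int &prod_val_out,
                  float &float_sum_val_out) {
  const float kPiValue = 3.14f;
  sum_val_out = 0;
  prod_val_out = 1;
  float_sum_val_out = 0.0f;
  #pragma omp for reduction(original(private), + : sum_val_out)  \
                  reduction(original(private), * : prod_val_out) \
                  reduction(original(private), + : float_sum_val_out)
  for (int i = 0; i < n; i++) {
    sum_val_out += i;            // '+' reduction over an int
    prod_val_out *= 2;           // '*' reduction over an int
    float_sum_val_out += kPiValue; // '+' reduction over a float
  }
}
```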
if (InitExpr && !PrivateType->isAggregateType() &&
    !PrivateType->isAnyComplexType()) {
Complex types should be supported; the compiler should not drop them silently.
Done. Added support for all types.
Add at least a runtime test with complex types, if possible
Updated the runtime test case with a complex-type test.
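A hedged sketch of the kind of complex-type coverage being asked for; the actual test added in the PR may differ. It mirrors the do_red pattern from the lit test, using C99 `_Complex`, and `do_red_complex` is a hypothetical helper meant to be called from inside a parallel region with each thread's private accumulator:

```c
#include <complex.h>

void do_red_complex(int n, const double _Complex *v, double _Complex *sum_v) {
  double _Complex local = 0.0;   /* private per calling thread */
  #pragma omp for reduction(original(private), + : local)
  for (int i = 0; i < n; i++)
    local += v[i];
  *sum_v = local;                /* after the reduction, every thread's copy
                                    holds the combined value */
}
```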
const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
  ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();

const Expr *AssignRHS = nullptr;
if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
  if (BinOp->getOpcode() == BO_Assign)
    AssignRHS = BinOp->getRHS();
} else if (const auto *OpCall =
               dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
  if (OpCall->getOperator() == OO_Equal)
    AssignRHS = OpCall->getArg(1);
}

if (!AssignRHS)
  return;

const Expr *CombinerExpr = AssignRHS->IgnoreParenImpCasts();
if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(CombinerExpr))
  CombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
What does it do? CombinerExpr is not used anywhere
Not used; redundant code left over from a previous rework. Removed.
@alexey-bataev Updated a few more tests with complex types. Any more feedback?
Addressed the release-note rebase problem.
const Expr *InitExpr = VD->getInit();
if (InitExpr) {
  Expr::EvalResult Result;
  if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) {
Not sure this is correct. What if the init value is not a constant? You need to emit the initial value and then assign it.
For a non-constant initializer, it is handled in the subsequent initialization phase within the EmitSharedInit block.
Why do you need such a complex analysis? Why can't you rely on EmitSharedInit to handle the initialization for constant values?
// Handle built-in reduction operations.
const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
  ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();

const Expr *AssignRHS = nullptr;
if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
  if (BinOp->getOpcode() == BO_Assign)
    AssignRHS = BinOp->getRHS();
} else if (const auto *OpCall =
               dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
  if (OpCall->getOperator() == OO_Equal)
    AssignRHS = OpCall->getArg(1);
}

if (!AssignRHS)
  return;
Why do you need this?
This was added just for structural validation of ReductionOps.
Then it should be an assertion, not an exit.
const Expr *InitExpr = VD->getInit();
if (InitExpr) {
  Expr::EvalResult Result;
  if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) {
Why do you need such a complex analysis? Why can't you rely on EmitSharedInit to handle the initialization for constant values?
// Handle built-in reduction operations.
const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
  ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();

const Expr *AssignRHS = nullptr;
if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
  if (BinOp->getOpcode() == BO_Assign)
    AssignRHS = BinOp->getRHS();
} else if (const auto *OpCall =
               dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
  if (OpCall->getOperator() == OO_Equal)
    AssignRHS = OpCall->getArg(1);
}
It would be good to have this check in the debug build only.
- Yes, you are right. EmitSharedInit is capable of handling constant-value initialization; the current check was more of a pre-analysis step to set the constant value for the shared variable. I have made the changes and ran some tests as well.
- I made the changes to have those checks in debug builds only; a hypothetical sketch is shown below.
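A hypothetical sketch of what a debug-only structural check could look like, reusing the names from the quoted hunk above; this is not the exact code in the patch:

```cpp
#ifndef NDEBUG
  // Structural validation only: the reduction op is expected to have the
  // shape 'LHS = RHS' (a BinaryOperator assignment or an operator= call).
  const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
  if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
    ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
  assert((isa<BinaryOperator>(ReductionClauseExpr) ||
          isa<CXXOperatorCallExpr>(ReductionClauseExpr)) &&
         "expected an assignment-shaped reduction op");
#endif
```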
Codegen support for reduction over a private variable with the reduction clause. Section 7.6.10 in the OpenMP 6.0 spec.
Sample Test Case from OpenMP 6.0 Example
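A representative example, taken from the new lit test clang/test/OpenMP/for_private_reduction_codegen.cpp in this patch; the exact spec example may differ:

```cpp
#define N 10
void do_red(int n, int *v, int &sum_v) {
  sum_v = 0;
  #pragma omp for reduction(original(private), + : sum_v)
  for (int i = 0; i < n; i++)
    sum_v += v[i];
}

int main(void) {
  int v[N];
  for (int i = 0; i < N; i++)
    v[i] = i;
  #pragma omp parallel num_threads(4)
  {
    int s_v;            // each thread's private copy of the original variable
    do_red(N, v, s_v);
  }
  return 0;
}
```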
Expected Codegen:
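A hedged, simplified sketch of the finalization sequence emitted for the example above, condensed from the FileCheck lines in the new test; `%tid`, `@loc`, and `%sum_v.orig` are placeholder names, and the real IR contains the usual `__kmpc_reduce` plumbing and may lower the atomic update differently depending on the type and operation:

```llvm
  ; after the regular __kmpc_reduce handling, emitPrivateReduction adds:
  call void @__kmpc_barrier(ptr @loc, i32 %tid)
  %is.thread0 = icmp eq i32 %tid, 0
  br i1 %is.thread0, label %init, label %init.end
init:                                              ; thread 0 initializes the shared temporary
  store i32 0, ptr @.omp.reduction..internal_private_var, align 4
  br label %init.end
init.end:
  call void @__kmpc_barrier(ptr @loc, i32 %tid)
  %partial = load i32, ptr %sum_v.orig, align 4    ; this thread's partial result
  %old = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 %partial seq_cst, align 4
  call void @__kmpc_barrier(ptr @loc, i32 %tid)
  %final = load i32, ptr @.omp.reduction..internal_private_var, align 4
  store i32 %final, ptr %sum_v.orig, align 4       ; broadcast back to each thread's original
  call void @__kmpc_barrier(ptr @loc, i32 %tid)
```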