-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[flang][fir] Add affine optimization pass pipeline. #138627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-flang-driver @llvm/pr-subscribers-flang-openmp Author: MingYan (NexMing) ChangesCurrently, the FIR dialect is directly lowered to the LLVM dialect. We can first convert the FIR dialect to the Affine dialect, perform optimizations on top of it, and then lower it back to the FIR dialect. The optimization passes are currently experimental, so it's important to actively identify and address issues. Full diff: https://github.com/llvm/llvm-project/pull/138627.diff 6 Files Affected:
diff --git a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
index 1cfaf285e75e6..320c561953213 100644
--- a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
+++ b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
@@ -42,6 +42,7 @@ extern llvm::cl::opt<bool> disableCfgConversion;
extern llvm::cl::opt<bool> disableFirAvc;
extern llvm::cl::opt<bool> disableFirMao;
+extern llvm::cl::opt<bool> enableAffineOpt;
extern llvm::cl::opt<bool> disableFirAliasTags;
extern llvm::cl::opt<bool> useOldAliasTags;
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index a3f59ee8dd013..5c87b1ce609ef 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -18,8 +18,8 @@
#include "flang/Optimizer/Passes/CommandLineOpts.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Tools/CrossToolHelpers.h"
-#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
-#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Pass/PassManager.h"
diff --git a/flang/lib/Optimizer/Passes/CMakeLists.txt b/flang/lib/Optimizer/Passes/CMakeLists.txt
index 1c19a5765aff1..ad6c714c28bec 100644
--- a/flang/lib/Optimizer/Passes/CMakeLists.txt
+++ b/flang/lib/Optimizer/Passes/CMakeLists.txt
@@ -21,6 +21,7 @@ add_flang_library(flangPasses
MLIRPass
MLIRReconcileUnrealizedCasts
MLIRSCFToControlFlow
+ MLIRSCFToOpenMP
MLIRSupport
MLIRTransforms
)
diff --git a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
index f95a280883cba..b8ae6ede423e3 100644
--- a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
+++ b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
@@ -55,6 +55,7 @@ cl::opt<bool> useOldAliasTags(
cl::desc("Use a single TBAA tree for all functions and do not use "
"the FIR alias tags pass"),
cl::init(false), cl::Hidden);
+EnableOption(AffineOpt, "affine-opt", "affine optimization");
/// CodeGen Passes
DisableOption(CodeGenRewrite, "codegen-rewrite", "rewrite FIR for codegen");
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index a3ef473ea39b7..e1653cdb1e874 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -211,6 +211,23 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
addNestedPassToAllTopLevelOperations<PassConstructor>(
pm, fir::createStackReclaim);
+
+ if (enableAffineOpt && pc.OptLevel.isOptimizingForSpeed()) {
+ pm.addPass(fir::createPromoteToAffinePass());
+ pm.addPass(mlir::createCSEPass());
+ pm.addPass(mlir::affine::createAffineLoopInvariantCodeMotionPass());
+ pm.addPass(mlir::affine::createAffineLoopNormalizePass());
+ pm.addPass(mlir::affine::createSimplifyAffineStructuresPass());
+ pm.addPass(mlir::affine::createAffineParallelize(
+ mlir::affine::AffineParallelizeOptions{1, false}));
+ pm.addPass(fir::createAffineDemotionPass());
+ pm.addPass(mlir::createLowerAffinePass());
+ if (pc.EnableOpenMP) {
+ pm.addPass(mlir::createConvertSCFToOpenMPPass());
+ pm.addPass(mlir::createCanonicalizerPass());
+ }
+ }
+
// convert control flow to CFG form
fir::addCfgConversionPass(pm, pc);
pm.addPass(mlir::createSCFToControlFlowPass());
diff --git a/flang/test/Lower/OpenMP/auto-omp.f90 b/flang/test/Lower/OpenMP/auto-omp.f90
new file mode 100644
index 0000000000000..d66e6c3f3a3a0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/auto-omp.f90
@@ -0,0 +1,52 @@
+! RUN: %flang_fc1 -O1 -mllvm --enable-affine-opt -emit-llvm -fopenmp -o - %s \
+! RUN: | FileCheck %s
+
+subroutine foo(a)
+ integer, dimension(100, 100), intent(out) :: a
+ a = 1
+end subroutine foo
+
+!CHECK-LABEL: entry:
+!CHECK: %[[VAL_0:.*]] = alloca { ptr }, align 8
+!CHECK: %[[VAL_1:.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1)
+!CHECK: store ptr %[[VAL_2:.*]], ptr %[[VAL_0]], align 8
+!CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull @1, i32 1, ptr nonnull @foo_..omp_par, ptr nonnull %[[VAL_0]])
+!CHECK: ret void
+!CHECK: omp.par.entry:
+!CHECK: %[[VAL_3:.*]] = load ptr, ptr %[[VAL_4:.*]], align 8, !align !3
+!CHECK: %[[VAL_5:.*]] = alloca i32, align 4
+!CHECK: %[[VAL_6:.*]] = alloca i64, align 8
+!CHECK: %[[VAL_7:.*]] = alloca i64, align 8
+!CHECK: %[[VAL_8:.*]] = alloca i64, align 8
+!CHECK: store i64 0, ptr %[[VAL_6]], align 8
+!CHECK: store i64 99, ptr %[[VAL_7]], align 8
+!CHECK: store i64 1, ptr %[[VAL_8]], align 8
+!CHECK: %[[VAL_9:.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @1)
+!CHECK: call void @__kmpc_for_static_init_8u(ptr nonnull @1, i32 %[[VAL_9]], i32 34, ptr nonnull %[[VAL_5]], ptr nonnull %[[VAL_6]], ptr nonnull %[[VAL_7]], ptr nonnull %[[VAL_8]], i64 1, i64 0)
+!CHECK: %[[VAL_10:.*]] = load i64, ptr %[[VAL_6]], align 8
+!CHECK: %[[VAL_11:.*]] = load i64, ptr %[[VAL_7]], align 8
+!CHECK: %[[VAL_12:.*]] = sub i64 %[[VAL_11]], %[[VAL_10]]
+!CHECK: %[[VAL_13:.*]] = icmp eq i64 %[[VAL_12]], -1
+!CHECK: br i1 %[[VAL_13]], label %[[VAL_14:.*]], label %[[VAL_15:.*]]
+!CHECK: omp_loop.exit: ; preds = %[[VAL_16:.*]], %[[VAL_17:.*]]
+!CHECK: call void @__kmpc_for_static_fini(ptr nonnull @1, i32 %[[VAL_9]])
+!CHECK: %[[VAL_18:.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @1)
+!CHECK: call void @__kmpc_barrier(ptr nonnull @2, i32 %[[VAL_18]])
+!CHECK: ret void
+!CHECK: omp_loop.body: ; preds = %[[VAL_17]], %[[VAL_16]]
+!CHECK: %[[VAL_19:.*]] = phi i64 [ %[[VAL_20:.*]], %[[VAL_16]] ], [ 0, %[[VAL_17]] ]
+!CHECK: %[[VAL_21:.*]] = add i64 %[[VAL_19]], %[[VAL_10]]
+!CHECK: %[[VAL_22:.*]] = mul i64 %[[VAL_21]], 400
+!CHECK: %[[VAL_23:.*]] = getelementptr i8, ptr %[[VAL_3]], i64 %[[VAL_22]]
+!CHECK: br label %[[VAL_24:.*]]
+!CHECK: omp_loop.inc: ; preds = %[[VAL_24]]
+!CHECK: %[[VAL_20]] = add nuw i64 %[[VAL_19]], 1
+!CHECK: %[[VAL_25:.*]] = icmp eq i64 %[[VAL_19]], %[[VAL_12]]
+!CHECK: br i1 %[[VAL_25]], label %[[VAL_14]], label %[[VAL_15]]
+!CHECK: omp.loop_nest.region6: ; preds = %[[VAL_15]], %[[VAL_24]]
+!CHECK: %[[VAL_26:.*]] = phi i64 [ 0, %[[VAL_15]] ], [ %[[VAL_27:.*]], %[[VAL_24]] ]
+!CHECK: %[[VAL_28:.*]] = getelementptr i32, ptr %[[VAL_23]], i64 %[[VAL_26]]
+!CHECK: store i32 1, ptr %[[VAL_28]], align 4, !tbaa !4
+!CHECK: %[[VAL_27]] = add nuw nsw i64 %[[VAL_26]], 1
+!CHECK: %[[VAL_29:.*]] = icmp eq i64 %[[VAL_27]], 100
+!CHECK: br i1 %[[VAL_29]], label %[[VAL_16]], label %[[VAL_24]]
pm.addPass(mlir::affine::createAffineLoopInvariantCodeMotionPass());
pm.addPass(mlir::affine::createAffineLoopNormalizePass());
pm.addPass(mlir::affine::createSimplifyAffineStructuresPass());
pm.addPass(mlir::affine::createAffineParallelize(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is AffineParallelize
specific for parallelizing to multiple threads or is it also applicable for single-thread transformations as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks
03ead06
to
ea6a6e5
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the update. LGTM, but wait for Kiran's approval too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG. Thanks.
99ecb0b
to
6dfdeaf
Compare
6dfdeaf
to
4e6b42c
Compare
Currently, the FIR dialect is directly lowered to the LLVM dialect. We can first convert the FIR dialect to the Affine dialect, perform optimizations on top of it, and then lower it back to the FIR dialect. The optimization passes are currently experimental, so it's important to actively identify and address issues.