diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 2b5488b2e8126..63cf1a5e3f7cf 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_component_library(LLVMVectorize VPlan.cpp VPlanAnalysis.cpp VPlanConstruction.cpp + VPlanPredicator.cpp VPlanRecipes.cpp VPlanSLP.cpp VPlanTransforms.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d6af8a1435d07..00b5b81cc6c96 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8216,185 +8216,6 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { }); } -void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { - BasicBlock *Src = SI->getParent(); - assert(!OrigLoop->isLoopExiting(Src) && - all_of(successors(Src), - [this](BasicBlock *Succ) { - return OrigLoop->getHeader() != Succ; - }) && - "unsupported switch either exiting loop or continuing to header"); - // Create masks where the terminator in Src is a switch. We create mask for - // all edges at the same time. This is more efficient, as we can create and - // collect compares for all cases once. - VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition()); - BasicBlock *DefaultDst = SI->getDefaultDest(); - MapVector> Dst2Compares; - for (auto &C : SI->cases()) { - BasicBlock *Dst = C.getCaseSuccessor(); - assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); - // Cases whose destination is the same as default are redundant and can be - // ignored - they will get there anyhow. - if (Dst == DefaultDst) - continue; - auto &Compares = Dst2Compares[Dst]; - VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); - Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); - } - - // We need to handle 2 separate cases below for all entries in Dst2Compares, - // which excludes destinations matching the default destination. - VPValue *SrcMask = getBlockInMask(Src); - VPValue *DefaultMask = nullptr; - for (const auto &[Dst, Conds] : Dst2Compares) { - // 1. Dst is not the default destination. Dst is reached if any of the cases - // with destination == Dst are taken. Join the conditions for each case - // whose destination == Dst using an OR. - VPValue *Mask = Conds[0]; - for (VPValue *V : ArrayRef(Conds).drop_front()) - Mask = Builder.createOr(Mask, V); - if (SrcMask) - Mask = Builder.createLogicalAnd(SrcMask, Mask); - EdgeMaskCache[{Src, Dst}] = Mask; - - // 2. Create the mask for the default destination, which is reached if none - // of the cases with destination != default destination are taken. Join the - // conditions for each case where the destination is != Dst using an OR and - // negate it. - DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask; - } - - if (DefaultMask) { - DefaultMask = Builder.createNot(DefaultMask); - if (SrcMask) - DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); - } - EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; -} - -VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { - assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - - // Look for cached value. - std::pair Edge(Src, Dst); - EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); - if (ECEntryIt != EdgeMaskCache.end()) - return ECEntryIt->second; - - if (auto *SI = dyn_cast(Src->getTerminator())) { - createSwitchEdgeMasks(SI); - assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?"); - return EdgeMaskCache[Edge]; - } - - VPValue *SrcMask = getBlockInMask(Src); - - // The terminator has to be a branch inst! - BranchInst *BI = dyn_cast(Src->getTerminator()); - assert(BI && "Unexpected terminator found"); - if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) - return EdgeMaskCache[Edge] = SrcMask; - - // If source is an exiting block, we know the exit edge is dynamically dead - // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction unless we are - // vectorizing a loop with uncountable exits. In that case, we always - // materialize the mask. - if (OrigLoop->isLoopExiting(Src) && - Src != Legal->getUncountableEarlyExitingBlock()) - return EdgeMaskCache[Edge] = SrcMask; - - VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); - assert(EdgeMask && "No Edge Mask found for condition"); - - if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); - - if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. - // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask - // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' - // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. - EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); - } - - return EdgeMaskCache[Edge] = EdgeMask; -} - -VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { - assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - - // Look for cached value. - std::pair Edge(Src, Dst); - EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); - assert(ECEntryIt != EdgeMaskCache.end() && - "looking up mask for edge which has not been created"); - return ECEntryIt->second; -} - -void VPRecipeBuilder::createHeaderMask() { - BasicBlock *Header = OrigLoop->getHeader(); - - // When not folding the tail, use nullptr to model all-true mask. - if (!CM.foldTailByMasking()) { - BlockMaskCache[Header] = nullptr; - return; - } - - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, NewInsertionPoint); - - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - VPValue *BlockMask = nullptr; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); - BlockMaskCache[Header] = BlockMask; -} - -VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { - // Return the cached value. - BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); - assert(BCEntryIt != BlockMaskCache.end() && - "Trying to access mask for block without one."); - return BCEntryIt->second; -} - -void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { - assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); - assert(OrigLoop->getHeader() != BB && - "Loop header must have cached block mask"); - - // All-one mask is modelled as no-mask following the convention for masked - // load/store/gather/scatter. Initialize BlockMask to no-mask. - VPValue *BlockMask = nullptr; - // This is the block mask. We OR all unique incoming edges. - for (auto *Predecessor : - SetVector(llvm::from_range, predecessors(BB))) { - VPValue *EdgeMask = createEdgeMask(Predecessor, BB); - if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. - BlockMaskCache[BB] = EdgeMask; - return; - } - - if (!BlockMask) { // BlockMask has its initialized nullptr value. - BlockMask = EdgeMask; - continue; - } - - BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); - } - - BlockMaskCache[BB] = BlockMask; -} - VPWidenMemoryRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range) { @@ -8539,31 +8360,6 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( return nullptr; } -VPBlendRecipe *VPRecipeBuilder::tryToBlend(VPWidenPHIRecipe *PhiR) { - // We know that all PHIs in non-header blocks are converted into selects, so - // we don't have to worry about the insertion order and we can just use the - // builder. At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = PhiR->getNumIncoming(); - SmallVector OperandsWithMask; - for (unsigned In = 0; In < NumIncoming; In++) { - OperandsWithMask.push_back(PhiR->getIncomingValue(In)); - const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); - VPValue *EdgeMask = getEdgeMask(Pred, PhiR->getParent()); - if (!EdgeMask) { - assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(PhiR->operands()) && - "Distinct incoming values with one having a full mask"); - break; - } - OperandsWithMask.push_back(EdgeMask); - } - return new VPBlendRecipe(cast(PhiR->getUnderlyingInstr()), - OperandsWithMask); -} - VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef Operands, VFRange &Range) { @@ -8958,10 +8754,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, if (auto *PhiR = dyn_cast(R)) { VPBasicBlock *Parent = PhiR->getParent(); VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion(); - // Handle phis in non-header blocks. - if (!LoopRegionOf || LoopRegionOf->getEntry() != Parent) - return tryToBlend(PhiR); - + assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && + "Non-header phis should have been handled during predication"); auto *Phi = cast(R->getUnderlyingInstr()); assert(Operands.size() == 2 && "Must have 2 operands for header phis"); if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) @@ -9378,8 +9172,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - DenseMap VPB2IRBB; - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB); + auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); VPlanTransforms::prepareForVectorization( *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, CM.foldTailByMasking(), OrigLoop, @@ -9412,9 +9205,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, cast(IVInc)->dropPoisonGeneratingFlags(); } - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, VPB2IRBB, LVer); - // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further // process after constructing the initial VPlan. @@ -9442,43 +9232,32 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, } // --------------------------------------------------------------------------- - // Construct recipes for the instructions in the loop + // Predicate and linearize the top-level loop region. // --------------------------------------------------------------------------- + DenseMap BlockMaskCache; + VPlanTransforms::predicateAndLinearize(*Plan, CM.foldTailByMasking(), + BlockMaskCache); - VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); - BasicBlock *HeaderBB = OrigLoop->getHeader(); - bool NeedsMasks = - CM.foldTailByMasking() || - any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { - bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); - return Legal->blockNeedsPredication(BB) || NeedsBlends; - }); - + // --------------------------------------------------------------------------- + // Construct recipes for the instructions in the loop + // --------------------------------------------------------------------------- + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder, BlockMaskCache, LVer); RecipeBuilder.collectScaledReductions(Range); - auto *MiddleVPBB = Plan->getMiddleBlock(); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. + VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); ReversePostOrderTraversal> RPOT( HeaderVPBB); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); + // Mapping from VPValues in the initial plan to their widened VPValues. Needed + // temporarily to update created block masks. + DenseMap Old2New; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - // Create mask based on the IR BB corresponding to VPBB. - // TODO: Predicate directly based on VPlan. - Builder.setInsertPoint(VPBB, VPBB->begin()); - if (VPBB == HeaderVPBB) { - Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi()); - RecipeBuilder.createHeaderMask(); - } else if (NeedsMasks) { - // FIXME: At the moment, masks need to be placed at the beginning of the - // block, as blends introduced for phi nodes need to use it. The created - // blends should be sunk after the mask recipes. - RecipeBuilder.createBlockInMask(VPBB); - } - // Convert input VPInstructions to widened recipes. for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *SingleDef = cast(&R); @@ -9488,7 +9267,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // latter are added above for masking. // FIXME: Migrate code relying on the underlying instruction from VPlan0 // to construct recipes below to not use the underlying instruction. - if (isa(&R) || + if (isa( + &R) || (isa(&R) && !UnderlyingValue)) continue; @@ -9497,14 +9277,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, assert((isa(&R) || isa(&R)) && UnderlyingValue && "unsupported recipe"); - if (isa(&R) && - (cast(&R)->getOpcode() == - VPInstruction::BranchOnCond || - (cast(&R)->getOpcode() == Instruction::Switch))) { - R.eraseFromParent(); - break; - } - // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. Instruction *Instr = cast(UnderlyingValue); @@ -9542,26 +9314,24 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, } else { Builder.insert(Recipe); } - if (Recipe->getNumDefinedValues() == 1) + if (Recipe->getNumDefinedValues() == 1) { SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue()); - else + Old2New[SingleDef] = Recipe->getVPSingleValue(); + } else { assert(Recipe->getNumDefinedValues() == 0 && "Unexpected multidef recipe"); - R.eraseFromParent(); + R.eraseFromParent(); + } } } - VPBlockBase *PrevVPBB = nullptr; - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - // Flatten the CFG in the loop. Masks for blocks have already been generated - // and added to recipes as needed. To do so, first disconnect VPBB from its - // successors. Then connect VPBB to the previously visited VPBB. - for (auto *Succ : to_vector(VPBB->getSuccessors())) - VPBlockUtils::disconnectBlocks(VPBB, Succ); - if (PrevVPBB) - VPBlockUtils::connectBlocks(PrevVPBB, VPBB); - PrevVPBB = VPBB; - } + // replaceAllUsesWith above may invalidate the block masks. Update them here. + // TODO: Include the masks as operands in the predicated VPlan directly + // to remove the need to keep a map of masks beyond the predication + // transform. + RecipeBuilder.updateBlockMaskCache(Old2New); + for (const auto &[Old, New] : Old2New) + Old->getDefiningRecipe()->eraseFromParent(); assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -9690,8 +9460,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - DenseMap VPB2IRBB; - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB); + auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); VPlanTransforms::prepareForVectorization( *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop, getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false, @@ -9711,8 +9480,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // Collect mapping of IR header phis to header phi recipes, to be used in // addScalarResumePhis. + DenseMap BlockMaskCache; VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, VPB2IRBB, nullptr /*LVer*/); + Builder, BlockMaskCache, nullptr /*LVer*/); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index ae86181487261..264b1ea3deb97 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -68,15 +68,7 @@ class VPRecipeBuilder { VPBuilder &Builder; - /// When we if-convert we need to create edge masks. We have to cache values - /// so that we don't end up with exponential recursion/IR. Note that - /// if-conversion currently takes place during VPlan-construction, so these - /// caches are only used at that stage. - using EdgeMaskCacheTy = - DenseMap, VPValue *>; - using BlockMaskCacheTy = DenseMap; - EdgeMaskCacheTy EdgeMaskCache; - BlockMaskCacheTy BlockMaskCache; + DenseMap &BlockMaskCache; // VPlan construction support: Hold a mapping from ingredients to // their recipe. @@ -90,10 +82,6 @@ class VPRecipeBuilder { /// A mapping of partial reduction exit instructions to their scaling factor. DenseMap ScaledReductionMap; - /// A mapping from VP blocks to IR blocks, used temporarily while migrating - /// away from IR references. - const DenseMap &VPB2IRBB; - /// Loop versioning instance for getting noalias metadata guaranteed by /// runtime checks. LoopVersioning *LVer; @@ -122,11 +110,6 @@ class VPRecipeBuilder { tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef Operands, VFRange &Range); - /// Handle non-loop phi nodes, returning a new VPBlendRecipe. Currently - /// all such phi nodes are turned into a sequence of select instructions as - /// the vectorizer currently performs full if-conversion. - VPBlendRecipe *tryToBlend(VPWidenPHIRecipe *PhiR); - /// Handle call instructions. If \p CI can be widened for \p Range.Start, /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be /// decreased to ensure same decision from \p Range.Start to \p Range.End. @@ -164,10 +147,11 @@ class VPRecipeBuilder { LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder, - const DenseMap &VPB2IRBB, + DenseMap &BlockMaskCache, LoopVersioning *LVer) : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder), VPB2IRBB(VPB2IRBB), LVer(LVer) {} + CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache), + LVer(LVer) {} std::optional getScalingForReduction(const Instruction *ExitInst) { auto It = ScaledReductionMap.find(ExitInst); @@ -196,38 +180,10 @@ class VPRecipeBuilder { Ingredient2Recipe[I] = R; } - /// Create the mask for the vector loop header block. - void createHeaderMask(); - - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True or the loop mask when - /// tail folding. - void createBlockInMask(const VPBasicBlock *VPBB) { - return createBlockInMask(VPB2IRBB.lookup(VPBB)); - } - void createBlockInMask(BasicBlock *BB); - - /// Returns the *entry* mask for the block \p VPBB. - VPValue *getBlockInMask(const VPBasicBlock *VPBB) const { - return getBlockInMask(VPB2IRBB.lookup(VPBB)); - } - /// Returns the *entry* mask for the block \p BB. - VPValue *getBlockInMask(BasicBlock *BB) const; - - /// Create an edge mask for every destination of cases and/or default. - void createSwitchEdgeMasks(SwitchInst *SI); - - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - - /// A helper that returns the previously computed predicate of the edge - /// between SRC and DST. - VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { - return getEdgeMask(VPB2IRBB.lookup(Src), VPB2IRBB.lookup(Dst)); + VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); } - VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const; /// Return the recipe created for given ingredient. VPRecipeBase *getRecipe(Instruction *I) { @@ -252,6 +208,15 @@ class VPRecipeBuilder { } return Plan.getOrAddLiveIn(V); } + + void updateBlockMaskCache(const DenseMap &Old2New) { + for (auto &[_, V] : BlockMaskCache) { + if (auto *New = Old2New.lookup(V)) { + V->replaceAllUsesWith(New); + V = New; + } + } + } }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 287bc93ce496a..92bd49ace3638 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -66,8 +66,7 @@ class PlainCFGBuilder { : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} /// Build plain CFG for TheLoop and connects it to Plan's entry. - std::unique_ptr - buildPlainCFG(DenseMap &VPB2IRBB); + std::unique_ptr buildPlainCFG(); }; } // anonymous namespace @@ -242,8 +241,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, } // Main interface to build the plain CFG. -std::unique_ptr PlainCFGBuilder::buildPlainCFG( - DenseMap &VPB2IRBB) { +std::unique_ptr PlainCFGBuilder::buildPlainCFG() { VPIRBasicBlock *Entry = cast(Plan->getEntry()); BB2VPBB[Entry->getIRBasicBlock()] = Entry; for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks()) @@ -334,18 +332,14 @@ std::unique_ptr PlainCFGBuilder::buildPlainCFG( } } - for (const auto &[IRBB, VPB] : BB2VPBB) - VPB2IRBB[VPB] = IRBB; - LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan); return std::move(Plan); } -std::unique_ptr VPlanTransforms::buildPlainCFG( - Loop *TheLoop, LoopInfo &LI, - DenseMap &VPB2IRBB) { +std::unique_ptr VPlanTransforms::buildPlainCFG(Loop *TheLoop, + LoopInfo &LI) { PlainCFGBuilder Builder(TheLoop, &LI); - return Builder.buildPlainCFG(VPB2IRBB); + return Builder.buildPlainCFG(); } /// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp new file mode 100644 index 0000000000000..e0e0509353639 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -0,0 +1,301 @@ +//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements predication for VPlans. +/// +//===----------------------------------------------------------------------===// + +#include "VPRecipeBuilder.h" +#include "VPlan.h" +#include "VPlanCFG.h" +#include "VPlanTransforms.h" +#include "VPlanUtils.h" +#include "llvm/ADT/PostOrderIterator.h" + +using namespace llvm; + +namespace { +class VPPredicator { + using BlockMaskCacheTy = DenseMap; + /// Builder to construct recipes to compute masks. + VPBuilder Builder; + + /// When we if-convert we need to create edge masks. We have to cache values + /// so that we don't end up with exponential recursion/IR. + using EdgeMaskCacheTy = + DenseMap, + VPValue *>; + EdgeMaskCacheTy EdgeMaskCache; + + BlockMaskCacheTy &BlockMaskCache; + + /// Create an edge mask for every destination of cases and/or default. + void createSwitchEdgeMasks(VPInstruction *SI); + + /// Computes and return the predicate of the edge between \p Src and \p Dst. + VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); + + /// Returns the *entry* mask for \p VPBB. + VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); + } + + void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { + // TODO: Include the masks as operands in the predicated VPlan directly to + // remove the need to keep a map of masks beyond the predication transform. + assert(!getBlockInMask(VPBB) && "Mask already set"); + BlockMaskCache[VPBB] = Mask; + } + + VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst, + VPValue *Mask) { + assert(!getEdgeMask(Src, Dst) && "Mask already set"); + return EdgeMaskCache[{Src, Dst}] = Mask; + } + +public: + VPPredicator(BlockMaskCacheTy &BlockMaskCache) + : BlockMaskCache(BlockMaskCache) {} + + /// Returns the precomputed predicate of the edge from \p Src to \p Dst. + VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { + return EdgeMaskCache.lookup({Src, Dst}); + } + + /// Compute and return the mask for the vector loop header block. + void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail); + + /// Compute and return the predicate of \p VPBB, assuming that the header + /// block of the loop is set to True or the loop mask when tail folding. + VPValue *createBlockInMask(VPBasicBlock *VPBB); +}; +} // namespace + +VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) { + assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge"); + + // Look for cached value. + VPValue *EdgeMask = getEdgeMask(Src, Dst); + if (EdgeMask) + return EdgeMask; + + VPValue *SrcMask = getBlockInMask(Src); + + // If there's a single successor, there's no terminator recipe. + if (Src->getNumSuccessors() == 1) + return setEdgeMask(Src, Dst, SrcMask); + + auto *Term = cast(Src->getTerminator()); + if (Term->getOpcode() == Instruction::Switch) { + createSwitchEdgeMasks(Term); + return getEdgeMask(Src, Dst); + } + + assert(Term->getOpcode() == VPInstruction::BranchOnCond && + "Unsupported terminator"); + if (Src->getSuccessors()[0] == Src->getSuccessors()[1]) + return setEdgeMask(Src, Dst, SrcMask); + + EdgeMask = Term->getOperand(0); + assert(EdgeMask && "No Edge Mask found for condition"); + + if (Src->getSuccessors()[0] != Dst) + EdgeMask = Builder.createNot(EdgeMask, Term->getDebugLoc()); + + if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. + // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask + // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' + // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. + EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, Term->getDebugLoc()); + } + + return setEdgeMask(Src, Dst, EdgeMask); +} + +VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) { + Builder.setInsertPoint(VPBB, VPBB->begin()); + // All-one mask is modelled as no-mask following the convention for masked + // load/store/gather/scatter. Initialize BlockMask to no-mask. + VPValue *BlockMask = nullptr; + // This is the block mask. We OR all unique incoming edges. + for (auto *Predecessor : SetVector( + VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) { + VPValue *EdgeMask = createEdgeMask(cast(Predecessor), VPBB); + if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is + // too. + setBlockInMask(VPBB, EdgeMask); + return EdgeMask; + } + + if (!BlockMask) { // BlockMask has its initial nullptr value. + BlockMask = EdgeMask; + continue; + } + + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); + } + + setBlockInMask(VPBB, BlockMask); + return BlockMask; +} + +void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) { + if (!FoldTail) { + setBlockInMask(HeaderVPBB, nullptr); + return; + } + + // Introduce the early-exit compare IV <= BTC to form header block mask. + // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by + // constructing the desired canonical IV in the header block as its first + // non-phi instructions. + + auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); + auto &Plan = *HeaderVPBB->getPlan(); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + HeaderVPBB->insert(IV, NewInsertionPoint); + + Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + setBlockInMask(HeaderVPBB, BlockMask); +} + +void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { + VPBasicBlock *Src = SI->getParent(); + + // Create masks where the terminator in Src is a switch. We create mask for + // all edges at the same time. This is more efficient, as we can create and + // collect compares for all cases once. + VPValue *Cond = SI->getOperand(0); + VPBasicBlock *DefaultDst = cast(Src->getSuccessors()[0]); + MapVector> Dst2Compares; + for (const auto &[Idx, Succ] : + enumerate(ArrayRef(Src->getSuccessors()).drop_front())) { + VPBasicBlock *Dst = cast(Succ); + assert(!getEdgeMask(Src, Dst) && "Edge masks already created"); + // Cases whose destination is the same as default are redundant and can + // be ignored - they will get there anyhow. + if (Dst == DefaultDst) + continue; + auto &Compares = Dst2Compares[Dst]; + VPValue *V = SI->getOperand(Idx + 1); + Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); + } + + // We need to handle 2 separate cases below for all entries in Dst2Compares, + // which excludes destinations matching the default destination. + VPValue *SrcMask = getBlockInMask(Src); + VPValue *DefaultMask = nullptr; + for (const auto &[Dst, Conds] : Dst2Compares) { + // 1. Dst is not the default destination. Dst is reached if any of the + // cases with destination == Dst are taken. Join the conditions for each + // case whose destination == Dst using an OR. + VPValue *Mask = Conds[0]; + for (VPValue *V : ArrayRef(Conds).drop_front()) + Mask = Builder.createOr(Mask, V); + if (SrcMask) + Mask = Builder.createLogicalAnd(SrcMask, Mask); + setEdgeMask(Src, Dst, Mask); + + // 2. Create the mask for the default destination, which is reached if + // none of the cases with destination != default destination are taken. + // Join the conditions for each case where the destination is != Dst using + // an OR and negate it. + DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask; + } + + if (DefaultMask) { + DefaultMask = Builder.createNot(DefaultMask); + if (SrcMask) + DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); + } + setEdgeMask(Src, DefaultDst, DefaultMask); +} + +void VPlanTransforms::predicateAndLinearize( + VPlan &Plan, bool FoldTail, + DenseMap &BlockMaskCache) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + // Scan the body of the loop in a topological order to visit each basic block + // after having visited its predecessor basic blocks. + VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); + ReversePostOrderTraversal> RPOT( + Header); + VPPredicator Predicator(BlockMaskCache); + for (VPBlockBase *VPB : RPOT) { + // Only regions with only VPBBs are supported at the moment. + auto *VPBB = cast(VPB); + // Introduce the mask for VPBB, which may introduce needed edge masks, and + // convert all phi recipes of VPBB to blend recipes unless VPBB is the + // header. + if (VPBB == Header) { + Predicator.createHeaderMask(Header, FoldTail); + continue; + } + + SmallVector Phis; + for (VPRecipeBase &R : VPBB->phis()) + Phis.push_back(cast(&R)); + + Predicator.createBlockInMask(VPBB); + + for (VPWidenPHIRecipe *PhiR : Phis) { + // The non-header Phi is converted into a Blend recipe below, + // so we don't have to worry about the insertion order and we can just use + // the builder. At this point we generate the predication tree. There may + // be duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + SmallVector OperandsWithMask; + unsigned NumIncoming = PhiR->getNumIncoming(); + for (unsigned In = 0; In < NumIncoming; In++) { + const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); + OperandsWithMask.push_back(PhiR->getIncomingValue(In)); + VPValue *EdgeMask = Predicator.getEdgeMask(Pred, VPBB); + if (!EdgeMask) { + assert(In == 0 && "Both null and non-null edge masks found"); + assert(all_equal(PhiR->operands()) && + "Distinct incoming values with one having a full mask"); + break; + } + OperandsWithMask.push_back(EdgeMask); + } + PHINode *IRPhi = cast(PhiR->getUnderlyingValue()); + auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask); + Blend->insertBefore(PhiR); + PhiR->replaceAllUsesWith(Blend); + PhiR->eraseFromParent(); + } + } + + // Linearize the blocks of the loop into one serial chain. + VPBlockBase *PrevVPBB = nullptr; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + // Handle VPBBs down to the latch. + if (VPBB == LoopRegion->getExiting()) { + if (PrevVPBB) + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + break; + } + + auto Successors = to_vector(VPBB->getSuccessors()); + if (Successors.size() > 1) + VPBB->getTerminator()->eraseFromParent(); + + // Flatten the CFG in the loop. To do so, first disconnect VPBB from its + // successors. Then connect VPBB to the previously visited VPBB. + for (auto *Succ : Successors) + VPBlockUtils::disconnectBlocks(VPBB, Succ); + if (PrevVPBB) + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + + PrevVPBB = VPBB; + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3a1ed7406b383..62c0be97a3a55 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -53,9 +53,7 @@ struct VPlanTransforms { verifyVPlanIsValid(Plan); } - static std::unique_ptr - buildPlainCFG(Loop *TheLoop, LoopInfo &LI, - DenseMap &VPB2IRBB); + static std::unique_ptr buildPlainCFG(Loop *TheLoop, LoopInfo &LI); /// Prepare the plan for vectorization. It will introduce a dedicated /// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit @@ -224,6 +222,16 @@ struct VPlanTransforms { /// candidates. static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth); + + /// Predicate and linearize the control-flow in the only loop region of + /// \p Plan. If \p FoldTail is true, also create a mask guarding the loop + /// header, otherwise use all-true for the header mask. Masks for blocks are + /// added to \p BlockMaskCache, which in turn will temporarily be used later + /// for wide recipe construction. This argument is temporary and will be + /// removed in the future. + static void + predicateAndLinearize(VPlan &Plan, bool FoldTail, + DenseMap &BlockMaskCache); }; } // namespace llvm diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index 2a15e907e5fa5..e2ad65b93e3dd 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -71,8 +71,7 @@ class VPlanTestIRBase : public testing::Test { Loop *L = LI->getLoopFor(LoopHeader); PredicatedScalarEvolution PSE(*SE, *L); - DenseMap VPB2IRBB; - auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB); + auto Plan = VPlanTransforms::buildPlainCFG(L, *LI); VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2)); VPlanTransforms::prepareForVectorization(*Plan, IntegerType::get(*Ctx, 64), PSE, true, false, L, {}, false, R);