[AArch64] Lower alias mask to a whilewr

#100579 emits IR that creates a mask disabling lanes that could alias within a loop iteration, based on a pair of pointers. This PR lowers that IR to a WHILEWR instruction for AArch64.
llvm · SamTebbs33 · Aug 2, 2024 · Jun 26, 2024 · Jul 29, 2024 · Jul 29, 2024
commit c943b046e5eeb5faae74783b80137593a43760e6
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -94,6 +94,7 @@
 #include <bitset>
 #include <cassert>
 #include <cctype>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <iterator>
@@ -1523,6 +1524,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::OR, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -13782,8 +13784,88 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  return ResultSLI;
 }

+/// Try to lower the construction of a pointer alias mask to a WHILEWR.
+/// The mask's enabled lanes represent the elements that will not overlap across one loop iteration.
+/// This tries to match:
+/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+///    (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
+SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
+  if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
+    return SDValue();
+  auto LaneMask = Op.getOperand(0);
+  auto Splat = Op.getOperand(1);
+
+  if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
+      Splat.getOpcode() != ISD::SPLAT_VECTOR)
+    return SDValue();
+
+  auto Cmp = Splat.getOperand(0);
+  if (Cmp.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  CondCodeSDNode *Cond = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2));
+  assert(Cond && "SETCC doesn't have a condition code");
+
+  auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
+  if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
+      Cond->get() != ISD::CondCode::SETLT)
+    return SDValue();
+  unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
+  unsigned EltSize = CompValue + 1;
+  if (!isPowerOf2_64(EltSize) || EltSize > 64)
+    return SDValue();
+
+  auto Diff = Cmp.getOperand(0);
+  if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
+    return SDValue();
+
+  auto LaneMaskConst = dyn_cast<ConstantSDNode>(LaneMask.getOperand(1));
+  if (!LaneMaskConst || LaneMaskConst->getZExtValue() != 0 ||
+      (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
+    return SDValue();
+
+  // An alias mask for i8 elements omits the division because it would just divide by 1
+  if (EltSize > 1) {
+    auto DiffDiv = LaneMask.getOperand(2);
+    auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
+    if (!DiffDivConst || DiffDivConst->getZExtValue() != std::log2(EltSize))
+      return SDValue();
+  } else if (LaneMask.getOperand(2) != Diff)
+    return SDValue();
+
+  auto StorePtr = Diff.getOperand(0);
+  auto ReadPtr = Diff.getOperand(1);
+
+  unsigned IntrinsicID = 0;
+  switch (EltSize) {
+  case 1:
+    IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
+    break;
+  case 2:
+    IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
+    break;
+  case 4:
+    IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
+    break;
+  case 8:
+    IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
+    break;
+  default:
+    return SDValue();
+  }
+  SDLoc DL(Op);
+  SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
+  auto N = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
+                       StorePtr, ReadPtr);
+  return N;
+}
+
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                             SelectionDAG &DAG) const {
+
+  if (SDValue SV = tryWhileWRFromOR(Op, DAG))
+    return SV;
  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerToScalableOp(Op, DAG);