
[AArch64] Peephole optimization: merge AND and TST instructions

In some cases Clang does not merge an AND and a TST (i.e. an ANDS that writes
to XZR) of the same operands.

Example:

  tst x2, x1
  and x3, x2, x1

can be merged into:

  ands x3, x2, x1
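
As a source-level illustration (not part of the patch; the function below and
its name are made up), this pair typically shows up when the same bitwise AND
is needed both as a value and for a comparison against zero:

  // Hypothetical example: the AND result has a value use and a flags-only
  // use.  Before this patch the backend could emit an AND for the value and
  // a separate TST of the same two source registers for the comparison.
  int and_and_test(unsigned long in, unsigned long bit, unsigned long *out) {
    unsigned long masked = in & bit; // value use of the AND
    *out = masked;
    return masked != 0;              // flags-only use (compare against zero)
  }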

This patch adds such merging during instruction selection: when an AND is
replaced with an ANDS instruction in LowerSELECT_CC, all users of the AND are
also updated to use the ANDS instruction instead.
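
For readers unfamiliar with SelectionDAG, the replacement step can be sketched
as a standalone helper (illustrative only: emitAndsForAnd is a made-up name;
the real change lives in emitComparison, shown in the diff below):

  #include "AArch64ISelLowering.h"       // AArch64ISD::ANDS (in-tree include)
  #include "llvm/CodeGen/SelectionDAG.h"

  using namespace llvm;

  // Build the flag-setting ANDS, redirect all users of the plain AND to it,
  // and return the NZCV flags result for the caller's comparison.
  static SDValue emitAndsForAnd(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue And, MVT FlagsVT) {
    // ANDS has two results: result 0 is the AND value, result 1 is the flags.
    SDValue Ands = DAG.getNode(AArch64ISD::ANDS, DL,
                               DAG.getVTList(And.getValueType(), FlagsVT),
                               And.getOperand(0), And.getOperand(1));
    // Every user of the original AND now reads the ANDS value instead, so
    // the plain AND (and with it the separate TST) becomes dead.
    DAG.ReplaceAllUsesWith(And, Ands);
    return Ands.getValue(1);
  }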

Short discussion on mailing list:
http://llvm.1065342.n5.nabble.com/llvm-dev-ARM-Peephole-optimization-instructions-tst-add-tp133109.html

Patch by Pavel Kosov.

Differential Revision: https://reviews.llvm.org/D71701
Committed by Sjoerd Meijer on 2020-02-27 09:23:47 +00:00
commit ff9f01c06e (parent 7f3f3aa402)
5 changed files with 103 additions and 17 deletions


@@ -2702,7 +2702,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   // bits that are implicitly ANDed off by the above opcodes and if so, skip
   // the AND.
   uint64_t MaskImm;
-  if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+  if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
+      !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
     return false;
   if (countTrailingOnes(MaskImm) < Bits)


@@ -1754,14 +1754,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     LHS = LHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
-             !isUnsignedIntSetCC(CC)) {
-    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
-    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
-    // of the signed comparisons.
-    Opcode = AArch64ISD::ANDS;
-    RHS = LHS.getOperand(1);
-    LHS = LHS.getOperand(0);
+  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+    if (LHS.getOpcode() == ISD::AND) {
+      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+      // of the signed comparisons.
+      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+                                           DAG.getVTList(VT, MVT_CC),
+                                           LHS.getOperand(0),
+                                           LHS.getOperand(1));
+      // Replace all users of (and X, Y) with newly generated (ands X, Y)
+      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+      return ANDSNode.getValue(1);
+    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+      // Use result of ANDS
+      return LHS.getValue(1);
+    }
   }
   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)


@@ -18,12 +18,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshl_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsl w8, w0, w2
 ; CHECK-NEXT: lsr w9, w1, w9
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w0, w8, eq
 ; CHECK-NEXT: ret
 %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -146,12 +145,11 @@ define i8 @fshl_i8_const_fold() {
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshr_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsr w8, w1, w2
 ; CHECK-NEXT: lsl w9, w0, w9
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w1, w8, eq
 ; CHECK-NEXT: ret
 %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)


@@ -0,0 +1,81 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
%struct.anon = type { i32*, i32* }
@ptr_wrapper = common dso_local local_unnamed_addr global %struct.anon* null, align 8
define dso_local i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) local_unnamed_addr {
entry:
%0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
%result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
%tobool2 = icmp ne i32 %mask, 0
br label %do.body
do.body: ; preds = %4, %entry
; CHECK-LABEL: test_func_i32_two_uses:
; CHECK: ands [[DSTREG:w[0-9]+]]
; Usage #1
; CHECK: cmp [[DSTREG]]
; Usage #2
; CHECK: cbz [[DSTREG]]
%bit.addr.0 = phi i32 [ %bit, %entry ], [ %shl, %4 ]
%retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
%and = and i32 %bit.addr.0, %in
%tobool = icmp eq i32 %and, 0
%not.tobool = xor i1 %tobool, true
%inc = zext i1 %not.tobool to i32
%retval1.1 = add nuw nsw i32 %retval1.0, %inc
%1 = xor i1 %tobool, true
%2 = or i1 %tobool2, %1
%dummy = and i32 %mask, %in
%use_and = icmp eq i32 %and, %dummy
%dummy_or = or i1 %use_and, %2
br i1 %dummy_or, label %3, label %4
3: ; preds = %do.body
store i32* null, i32** %result, align 8
br label %4
4: ; preds = %do.body, %3
%shl = shl i32 %bit.addr.0, 1
%tobool6 = icmp eq i32 %shl, 0
br i1 %tobool6, label %do.end, label %do.body
do.end: ; preds = %4
ret i32 %retval1.1
}
define dso_local i32 @test_func_i64_one_use(i64 %in, i64 %bit, i64 %mask) local_unnamed_addr #0 {
entry:
%0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
%result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
%tobool2 = icmp ne i64 %mask, 0
br label %do.body
do.body: ; preds = %4, %entry
; CHECK-LABEL: test_func_i64_one_use:
; CHECK: ands [[DSTREG:x[0-9]+]], [[SRCREG1:x[0-9]+]], [[SRCREG2:x[0-9]+]]
; CHECK-NEXT: orr [[DSTREG]], [[SRCREG_ORR:x[0-9]+]], [[DSTREG]]
%bit.addr.0 = phi i64 [ %bit, %entry ], [ %shl, %4 ]
%retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
%and = and i64 %bit.addr.0, %in
%tobool = icmp eq i64 %and, 0
%not.tobool = xor i1 %tobool, true
%inc = zext i1 %not.tobool to i32
%retval1.1 = add nuw nsw i32 %retval1.0, %inc
%1 = xor i1 %tobool, true
%2 = or i1 %tobool2, %1
br i1 %2, label %3, label %4
3: ; preds = %do.body
store i32* null, i32** %result, align 8
br label %4
4: ; preds = %do.body, %3
%shl = shl i64 %bit.addr.0, 1
%tobool6 = icmp eq i64 %shl, 0
br i1 %tobool6, label %do.end, label %do.body
do.end: ; preds = %4
ret i32 %retval1.1
}


@@ -80,12 +80,11 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
 define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n6_fshl:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsl w8, w0, w2
 ; CHECK-NEXT: lsr w9, w1, w9
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w0, w8, eq
 ; CHECK-NEXT: ret
 %shamt_wide = sext i8 %shamt to i32
@@ -95,12 +94,11 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n7_fshr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsr w8, w1, w2
 ; CHECK-NEXT: lsl w9, w0, w9
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w1, w8, eq
 ; CHECK-NEXT: ret
 %shamt_wide = sext i8 %shamt to i32