
[AArch64] Peephole optimization: merge AND and TST instructions

In some cases Clang does not merge an AND and a TST (i.e. an ANDS that writes
to XZR) of the same operands.

Example:

  tst x2, x1
  and x3, x2, x1

can be merged into:

  ands x3, x2, x1
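
As a source-level illustration (not part of the patch; the function below and
its name are made up), this pair typically shows up when the same bitwise AND
is needed both as a value and for a comparison against zero:

  // Hypothetical example: the AND result has a value use and a flags-only
  // use.  Before this patch the backend could emit an AND for the value and
  // a separate TST of the same two source registers for the comparison.
  int and_and_test(unsigned long in, unsigned long bit, unsigned long *out) {
    unsigned long masked = in & bit; // value use of the AND
    *out = masked;
    return masked != 0;              // flags-only use (compare against zero)
  }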

This patch adds such merging during instruction selection: when an AND is
replaced with an ANDS instruction in LowerSELECT_CC, all users of the AND are
also updated to use the ANDS instruction instead.
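
For readers unfamiliar with SelectionDAG, the replacement step can be sketched
as a standalone helper (illustrative only: emitAndsForAnd is a made-up name;
the real change lives in emitComparison, shown in the diff below):

  #include "AArch64ISelLowering.h"       // AArch64ISD::ANDS (in-tree include)
  #include "llvm/CodeGen/SelectionDAG.h"

  using namespace llvm;

  // Build the flag-setting ANDS, redirect all users of the plain AND to it,
  // and return the NZCV flags result for the caller's comparison.
  static SDValue emitAndsForAnd(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue And, MVT FlagsVT) {
    // ANDS has two results: result 0 is the AND value, result 1 is the flags.
    SDValue Ands = DAG.getNode(AArch64ISD::ANDS, DL,
                               DAG.getVTList(And.getValueType(), FlagsVT),
                               And.getOperand(0), And.getOperand(1));
    // Every user of the original AND now reads the ANDS value instead, so
    // the plain AND (and with it the separate TST) becomes dead.
    DAG.ReplaceAllUsesWith(And, Ands);
    return Ands.getValue(1);
  }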

Short discussion on mailing list:
http://llvm.1065342.n5.nabble.com/llvm-dev-ARM-Peephole-optimization-instructions-tst-add-tp133109.html

Patch by Pavel Kosov.

Differential Revision: https://reviews.llvm.org/D71701
Committed by Sjoerd Meijer on 2020-02-27 09:23:47 +00:00
commit ff9f01c06e (parent 7f3f3aa402)
5 changed files with 103 additions and 17 deletions


@@ -2702,7 +2702,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   // bits that are implicitly ANDed off by the above opcodes and if so, skip
   // the AND.
   uint64_t MaskImm;
-  if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+  if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
+      !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
     return false;
   if (countTrailingOnes(MaskImm) < Bits)


@@ -1754,14 +1754,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     LHS = LHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
-             !isUnsignedIntSetCC(CC)) {
-    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
-    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
-    // of the signed comparisons.
-    Opcode = AArch64ISD::ANDS;
-    RHS = LHS.getOperand(1);
-    LHS = LHS.getOperand(0);
+  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+    if (LHS.getOpcode() == ISD::AND) {
+      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+      // of the signed comparisons.
+      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+                                           DAG.getVTList(VT, MVT_CC),
+                                           LHS.getOperand(0),
+                                           LHS.getOperand(1));
+      // Replace all users of (and X, Y) with newly generated (ands X, Y)
+      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+      return ANDSNode.getValue(1);
+    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+      // Use result of ANDS
+      return LHS.getValue(1);
+    }
   }
   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)


@@ -18,12 +18,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshl_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsl w8, w0, w2
 ; CHECK-NEXT: lsr w9, w1, w9
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w0, w8, eq
 ; CHECK-NEXT: ret
 %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -146,12 +145,11 @@ define i8 @fshl_i8_const_fold() {
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshr_i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsr w8, w1, w2
 ; CHECK-NEXT: lsl w9, w0, w9
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w1, w8, eq
 ; CHECK-NEXT: ret
 %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)


@@ -0,0 +1,81 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
%struct.anon = type { i32*, i32* }
@ptr_wrapper = common dso_local local_unnamed_addr global %struct.anon* null, align 8
define dso_local i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) local_unnamed_addr {
entry:
%0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
%result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
%tobool2 = icmp ne i32 %mask, 0
br label %do.body
do.body: ; preds = %4, %entry
; CHECK-LABEL: test_func_i32_two_uses:
; CHECK: ands [[DSTREG:w[0-9]+]]
; Usage #1
; CHECK: cmp [[DSTREG]]
; Usage #2
; CHECK: cbz [[DSTREG]]
%bit.addr.0 = phi i32 [ %bit, %entry ], [ %shl, %4 ]
%retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
%and = and i32 %bit.addr.0, %in
%tobool = icmp eq i32 %and, 0
%not.tobool = xor i1 %tobool, true
%inc = zext i1 %not.tobool to i32
%retval1.1 = add nuw nsw i32 %retval1.0, %inc
%1 = xor i1 %tobool, true
%2 = or i1 %tobool2, %1
%dummy = and i32 %mask, %in
%use_and = icmp eq i32 %and, %dummy
%dummy_or = or i1 %use_and, %2
br i1 %dummy_or, label %3, label %4
3: ; preds = %do.body
store i32* null, i32** %result, align 8
br label %4
4: ; preds = %do.body, %3
%shl = shl i32 %bit.addr.0, 1
%tobool6 = icmp eq i32 %shl, 0
br i1 %tobool6, label %do.end, label %do.body
do.end: ; preds = %4
ret i32 %retval1.1
}
define dso_local i32 @test_func_i64_one_use(i64 %in, i64 %bit, i64 %mask) local_unnamed_addr #0 {
entry:
%0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
%result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
%tobool2 = icmp ne i64 %mask, 0
br label %do.body
do.body: ; preds = %4, %entry
; CHECK-LABEL: test_func_i64_one_use:
; CHECK: ands [[DSTREG:x[0-9]+]], [[SRCREG1:x[0-9]+]], [[SRCREG2:x[0-9]+]]
; CHECK-NEXT: orr [[DSTREG]], [[SRCREG_ORR:x[0-9]+]], [[DSTREG]]
%bit.addr.0 = phi i64 [ %bit, %entry ], [ %shl, %4 ]
%retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
%and = and i64 %bit.addr.0, %in
%tobool = icmp eq i64 %and, 0
%not.tobool = xor i1 %tobool, true
%inc = zext i1 %not.tobool to i32
%retval1.1 = add nuw nsw i32 %retval1.0, %inc
%1 = xor i1 %tobool, true
%2 = or i1 %tobool2, %1
br i1 %2, label %3, label %4
3: ; preds = %do.body
store i32* null, i32** %result, align 8
br label %4
4: ; preds = %do.body, %3
%shl = shl i64 %bit.addr.0, 1
%tobool6 = icmp eq i64 %shl, 0
br i1 %tobool6, label %do.end, label %do.body
do.end: ; preds = %4
ret i32 %retval1.1
}


@@ -80,12 +80,11 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
 define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n6_fshl:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsl w8, w0, w2
 ; CHECK-NEXT: lsr w9, w1, w9
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w0, w8, eq
 ; CHECK-NEXT: ret
 %shamt_wide = sext i8 %shamt to i32
@@ -95,12 +94,11 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n7_fshr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w2, #0x1f
+; CHECK-NEXT: ands w9, w2, #0x1f
 ; CHECK-NEXT: neg w9, w9
 ; CHECK-NEXT: lsr w8, w1, w2
 ; CHECK-NEXT: lsl w9, w0, w9
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: tst w2, #0x1f
 ; CHECK-NEXT: csel w0, w1, w8, eq
 ; CHECK-NEXT: ret
 %shamt_wide = sext i8 %shamt to i32