[DAG][ARM][MIPS][RISCV] Improve funnel shift promotion to use 'double shift' patterns

Based on a discussion on D88783, if we're promoting a funnel shift to a width at least twice the size of the original type, then we can use the 'double shift' patterns (shifting the concatenated sources). Differential Revision: https://reviews.llvm.org/D89139
2025-01-31 12:41:49 +01:00 · 2020-10-12 14:10:18 +01:00 · 2020-10-12 14:10:18 +01:00 · 132f72d148
commit 132f72d148
parent 6940914260
4 changed files with 55 additions and 55 deletions
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@ -1129,27 +1129,44 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) {
  SDValue Lo = GetPromotedInteger(N->getOperand(1));
  SDValue Amount = GetPromotedInteger(N->getOperand(2));

-  unsigned OldBits = N->getOperand(0).getScalarValueSizeInBits();
-  unsigned NewBits = Hi.getScalarValueSizeInBits();
-
-  // Shift Lo up to occupy the upper bits of the promoted type.
  SDLoc DL(N);
+  EVT OldVT = N->getOperand(0).getValueType();
  EVT VT = Lo.getValueType();
-  Lo = DAG.getNode(ISD::SHL, DL, VT, Lo,
-                   DAG.getConstant(NewBits - OldBits, DL, VT));
+  unsigned Opcode = N->getOpcode();
+  bool IsFSHR = Opcode == ISD::FSHR;
+  unsigned OldBits = OldVT.getScalarSizeInBits();
+  unsigned NewBits = VT.getScalarSizeInBits();

  // Amount has to be interpreted modulo the old bit width.
  Amount =
      DAG.getNode(ISD::UREM, DL, VT, Amount, DAG.getConstant(OldBits, DL, VT));

-  unsigned Opcode = N->getOpcode();
-  if (Opcode == ISD::FSHR) {
-    // Increase Amount to shift the result into the lower bits of the promoted
-    // type.
-    Amount = DAG.getNode(ISD::ADD, DL, VT, Amount,
-                         DAG.getConstant(NewBits - OldBits, DL, VT));
+  // If the promoted type is twice the size (or more), then we use the
+  // traditional funnel 'double' shift codegen. This isn't necessary if the
+  // shift amount is constant.
+  // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z % bw)) >> bw.
+  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z % bw)).
+  if (NewBits >= (2 * OldBits) && !isa<ConstantSDNode>(Amount) &&
+      !TLI.isOperationLegalOrCustom(Opcode, VT)) {
+    SDValue HiShift = DAG.getConstant(OldBits, DL, VT);
+    Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, HiShift);
+    Lo = DAG.getZeroExtendInReg(Lo, DL, OldVT);
+    SDValue Res = DAG.getNode(ISD::OR, DL, VT, Hi, Lo);
+    Res = DAG.getNode(IsFSHR ? ISD::SRL : ISD::SHL, DL, VT, Res, Amount);
+    if (!IsFSHR)
+      Res = DAG.getNode(ISD::SRL, DL, VT, Res, HiShift);
+    return Res;
  }

+  // Shift Lo up to occupy the upper bits of the promoted type.
+  SDValue ShiftOffset = DAG.getConstant(NewBits - OldBits, DL, VT);
+  Lo = DAG.getNode(ISD::SHL, DL, VT, Lo, ShiftOffset);
+
+  // Increase Amount to shift the result into the lower bits of the promoted
+  // type.
+  if (IsFSHR)
+    Amount = DAG.getNode(ISD::ADD, DL, VT, Amount, ShiftOffset);
+
  return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amount);
 }

--- a/test/CodeGen/ARM/funnel-shift.ll
+++ b/test/CodeGen/ARM/funnel-shift.ll
@ -19,13 +19,10 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i16 @fshl_i16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-LABEL: fshl_i16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    and r2, r2, #15
-; CHECK-NEXT:    mov r3, #31
-; CHECK-NEXT:    lsl r1, r1, #16
-; CHECK-NEXT:    bic r3, r3, r2
-; CHECK-NEXT:    lsl r0, r0, r2
-; CHECK-NEXT:    lsr r1, r1, #1
-; CHECK-NEXT:    orr r0, r0, r1, lsr r3
+; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
+; CHECK-NEXT:    and r1, r2, #15
+; CHECK-NEXT:    lsl r0, r0, r1
+; CHECK-NEXT:    lsr r0, r0, #16
 ; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
@ -188,15 +185,9 @@ define i8 @fshl_i8_const_fold() {
 define i16 @fshr_i16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-LABEL: fshr_i16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r3, #1
-; CHECK-NEXT:    lsl r0, r0, #1
-; CHECK-NEXT:    bfi r2, r3, #4, #28
-; CHECK-NEXT:    mov r3, #31
-; CHECK-NEXT:    bic r3, r3, r2
-; CHECK-NEXT:    and r2, r2, #31
-; CHECK-NEXT:    lsl r1, r1, #16
-; CHECK-NEXT:    lsl r0, r0, r3
-; CHECK-NEXT:    orr r0, r0, r1, lsr r2
+; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
+; CHECK-NEXT:    and r1, r2, #15
+; CHECK-NEXT:    lsr r0, r0, r1
 ; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
--- a/test/CodeGen/Mips/funnel-shift.ll
+++ b/test/CodeGen/Mips/funnel-shift.ll
@ -19,15 +19,13 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i16 @fshl_i16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-LABEL: fshl_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi $1, $6, 15
-; CHECK-NEXT:    sllv $2, $4, $1
-; CHECK-NEXT:    sll $3, $5, 16
-; CHECK-NEXT:    srl $3, $3, 1
-; CHECK-NEXT:    not $1, $1
-; CHECK-NEXT:    andi $1, $1, 31
-; CHECK-NEXT:    srlv $1, $3, $1
+; CHECK-NEXT:    andi $1, $5, 65535
+; CHECK-NEXT:    sll $2, $4, 16
+; CHECK-NEXT:    or $1, $2, $1
+; CHECK-NEXT:    andi $2, $6, 15
+; CHECK-NEXT:    sllv $1, $1, $2
 ; CHECK-NEXT:    jr $ra
-; CHECK-NEXT:    or $2, $2, $1
+; CHECK-NEXT:    srl $2, $1, 16
  %f = call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
 }
@ -288,15 +286,12 @@ define i8 @fshl_i8_const_fold() {
 define i16 @fshr_i16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-LABEL: fshr_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    sll $1, $5, 16
+; CHECK-NEXT:    andi $1, $5, 65535
+; CHECK-NEXT:    sll $2, $4, 16
+; CHECK-NEXT:    or $1, $2, $1
 ; CHECK-NEXT:    andi $2, $6, 15
-; CHECK-NEXT:    ori $3, $2, 16
-; CHECK-NEXT:    srlv $1, $1, $3
-; CHECK-NEXT:    sll $3, $4, 1
-; CHECK-NEXT:    xori $2, $2, 15
-; CHECK-NEXT:    sllv $2, $3, $2
 ; CHECK-NEXT:    jr $ra
-; CHECK-NEXT:    or $2, $2, $1
+; CHECK-NEXT:    srlv $2, $1, $2
  %f = call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
 }
--- a/test/CodeGen/RISCV/rv64Zbt.ll
+++ b/test/CodeGen/RISCV/rv64Zbt.ll
@ -109,14 +109,13 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
 define signext i32 @fshl_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind {
 ; RV64I-LABEL: fshl_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a2, a2, 31
-; RV64I-NEXT:    sll a0, a0, a2
-; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 1
-; RV64I-NEXT:    srl a1, a1, a2
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    andi a1, a2, 31
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    srai a0, a0, 32
 ; RV64I-NEXT:    ret
 ;
 ; RV64IB-LABEL: fshl_i32:
@ -162,14 +161,12 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
 define signext i32 @fshr_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind {
 ; RV64I-LABEL: fshr_i32:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    andi a2, a2, 31
-; RV64I-NEXT:    ori a3, a2, 32
-; RV64I-NEXT:    srl a1, a1, a3
-; RV64I-NEXT:    slli a0, a0, 1
-; RV64I-NEXT:    xori a2, a2, 31
-; RV64I-NEXT:    sll a0, a0, a2
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    andi a1, a2, 31
+; RV64I-NEXT:    srl a0, a0, a1
 ; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    ret
 ;