[RISCV] Add RISCVISD::ROLW/RORW and use them for custom legalizing i32 rotl/rotr on RV64IZbb.

This should result in better utilization of RORIW since we
don't need to look for a SIGN_EXTEND_INREG that may not exist.

Also remove rotl/rotr isel matching to GREVI and just prefer RORI.
This keeps things consistent so we don't also have to match ROLW/RORW
to GREVIW. I expect RORI/RORIW performance to be the same as or
better than GREVI.

Differential Revision: https://reviews.llvm.org/D91449
Craig Topper 2020-11-20 10:11:34 -08:00
parent 7d25b5a0e9
commit 2a19bf1e6f
8 changed files with 58 additions and 110 deletions
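For context, the kind of code this change targets is an i32 rotate whose result is never sign extended, for example one that is only stored. Before this patch the RORIW pattern was rooted at a sext_inreg, so such rotates missed RORIW; with the ROLW/RORW nodes that no longer matters. A minimal C++ illustration (hypothetical example, not part of the patch):

// Hypothetical example (not from this commit): an i32 rotate whose result is
// only stored, so no SIGN_EXTEND_INREG appears in the DAG. With the new
// ROLW/RORW legalization this can still select roriw on RV64IZbb.
#include <cstdint>

void rotl7_store(uint32_t a, uint32_t *p) {
  // Recognized by the compiler as an i32 rotate left by 7.
  *p = (a << 7) | (a >> 25);
}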


@@ -376,62 +376,6 @@ bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
return true;
}
// Check that it is a RORIW (i32 Right Rotate Immediate on RV64).
// We first check that it is the right node tree:
//
// (SIGN_EXTEND_INREG (OR (SHL RS1, VC2),
//                        (SRL (AND RS1, VC3), VC1)))
//
// Then we check that the constant operands respect these constraints:
//
// VC2 == 32 - VC1
// VC3 | maskTrailingOnes<uint64_t>(VC1) == 0xffffffff
//
// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32
// and VC3 being 0xffffffff after accounting for SimplifyDemandedBits removing
// some bits due to the right shift.
bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
  if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
      Subtarget->getXLenVT() == MVT::i64 &&
      cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
    if (N.getOperand(0).getOpcode() == ISD::OR) {
      SDValue Or = N.getOperand(0);
      SDValue Shl = Or.getOperand(0);
      SDValue Srl = Or.getOperand(1);
      // OR is commutable so canonicalize SHL to LHS.
      if (Srl.getOpcode() == ISD::SHL)
        std::swap(Shl, Srl);
      if (Shl.getOpcode() == ISD::SHL && Srl.getOpcode() == ISD::SRL) {
        if (Srl.getOperand(0).getOpcode() == ISD::AND) {
          SDValue And = Srl.getOperand(0);
          if (And.getOperand(0) == Shl.getOperand(0) &&
              isa<ConstantSDNode>(Srl.getOperand(1)) &&
              isa<ConstantSDNode>(Shl.getOperand(1)) &&
              isa<ConstantSDNode>(And.getOperand(1))) {
            uint64_t VC1 = Srl.getConstantOperandVal(1);
            uint64_t VC2 = Shl.getConstantOperandVal(1);
            uint64_t VC3 = And.getConstantOperandVal(1);
            // The mask needs to be 0xffffffff, but SimplifyDemandedBits may
            // have removed lower bits that aren't necessary due to the right
            // shift.
            if (VC2 == (32 - VC1) &&
                (VC3 | maskTrailingOnes<uint64_t>(VC1)) == 0xffffffff) {
              RS1 = Shl.getOperand(0);
              Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
                                                Srl.getOperand(1).getValueType());
              return true;
            }
          }
        }
      }
    }
  }
  return false;
}
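As a worked illustration of the constraints the removed code checked (constants chosen for illustration, not taken from the patch): for a rotate right by 7, VC1 = 7, VC2 = 25, and an AND mask of 0xffffff80 is still accepted because the low 7 bits are shifted out anyway.

// Standalone sketch of the removed constraint check, using a local stand-in
// for llvm::maskTrailingOnes. Constants are illustrative only.
#include <cassert>
#include <cstdint>

static uint64_t maskTrailingOnes64(unsigned N) {
  return N == 0 ? 0 : (~0ULL >> (64 - N));
}

int main() {
  uint64_t VC1 = 7;          // SRL amount (the rotate amount we want)
  uint64_t VC2 = 32 - VC1;   // SHL amount, must equal 32 - VC1
  uint64_t VC3 = 0xffffff80; // AND mask; the low VC1 bits may have been dropped
  assert(VC2 == (32 - VC1));
  assert((VC3 | maskTrailingOnes64(VC1)) == 0xffffffff);
  return 0;
}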
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (addi base, off1), off2) -> (load base, off1+off2)
// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)


@@ -50,7 +50,6 @@ public:
bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt);
bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt);
bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt);
bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt);
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"


@@ -151,7 +151,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) {
if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
if (Subtarget.is64Bit()) {
setOperationAction(ISD::ROTL, MVT::i32, Custom);
setOperationAction(ISD::ROTR, MVT::i32, Custom);
}
} else {
setOperationAction(ISD::ROTL, XLenVT, Expand);
setOperationAction(ISD::ROTR, XLenVT, Expand);
}
@@ -908,6 +913,10 @@ static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
return RISCVISD::DIVUW;
case ISD::UREM:
return RISCVISD::REMUW;
case ISD::ROTL:
return RISCVISD::ROLW;
case ISD::ROTR:
return RISCVISD::RORW;
case RISCVISD::GREVI:
return RISCVISD::GREVIW;
case RISCVISD::GORCI:
@@ -1013,6 +1022,12 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
return;
Results.push_back(customLegalizeToWOp(N, DAG));
break;
case ISD::ROTL:
case ISD::ROTR:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
Results.push_back(customLegalizeToWOp(N, DAG));
break;
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM:
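customLegalizeToWOp itself is not shown in this diff. For orientation, a sketch of the usual shape of that helper (assumed, not verbatim from RISCVISelLowering.cpp):

// Assumed sketch: wrap an illegal i32 node in its 64-bit *W counterpart and
// truncate the result back to i32 so ReplaceNodeResults keeps the type.
static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
  // Any-extend is fine: the *W nodes only read the low 32 bits of the value
  // operand and the low 5 bits of the shift/rotate amount.
  SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
  SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
  SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
}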
@@ -1267,7 +1282,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
case RISCVISD::SLLW:
case RISCVISD::SRAW:
case RISCVISD::SRLW: {
case RISCVISD::SRLW:
case RISCVISD::ROLW:
case RISCVISD::RORW: {
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
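The rest of this combine is cut off by the diff context; its usual shape (assumed, not verbatim from the file) is to hand exactly those demanded bits to SimplifyDemandedBits:

// Assumed continuation of the case above: request only the bits the W node
// actually reads so the computations feeding the other bits can be pruned.
APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
if (SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI) ||
    SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI))
  return SDValue(N, 0);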
@@ -1392,6 +1409,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
case RISCVISD::DIVW:
case RISCVISD::DIVUW:
case RISCVISD::REMUW:
case RISCVISD::ROLW:
case RISCVISD::RORW:
case RISCVISD::GREVIW:
case RISCVISD::GORCIW:
// TODO: As the result is sign-extended, this is conservatively correct. A
@@ -2829,6 +2848,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DIVW)
NODE_NAME_CASE(DIVUW)
NODE_NAME_CASE(REMUW)
NODE_NAME_CASE(ROLW)
NODE_NAME_CASE(RORW)
NODE_NAME_CASE(FMV_W_X_RV64)
NODE_NAME_CASE(FMV_X_ANYEXTW_RV64)
NODE_NAME_CASE(READ_CYCLE_WIDE)


@@ -42,6 +42,10 @@ enum NodeType : unsigned {
DIVW,
DIVUW,
REMUW,
// RV64IB rotates, directly matching the semantics of the named RISC-V
// instructions.
ROLW,
RORW,
// FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast
// is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X.
// FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result.
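For reference, a plain C++ model of the semantics the new nodes (and the underlying ROLW/RORW/RORIW instructions) carry: operate on the low 32 bits and sign-extend the 32-bit result, with rol by n equivalent to ror by (32 - n) mod 32, which is what the immediate-rotate pattern later in this patch relies on. Illustration only, not part of the patch:

// Illustrative-only model of RORW/ROLW: rotate the low 32 bits of rs1 by
// rs2[4:0], then sign-extend bit 31 of the result to 64 bits.
#include <cstdint>

int64_t rorw(int64_t rs1, int64_t rs2) {
  uint32_t x = static_cast<uint32_t>(rs1);
  unsigned sh = static_cast<unsigned>(rs2) & 31;
  uint32_t r = sh ? ((x >> sh) | (x << (32 - sh))) : x;
  return static_cast<int32_t>(r); // sign-extend to 64 bits
}

int64_t rolw(int64_t rs1, int64_t rs2) {
  // rol by n == ror by (32 - n) mod 32; this is also why an immediate rol
  // can be selected as RORIW with a complemented shift amount.
  return rorw(rs1, (32 - (static_cast<unsigned>(rs2) & 31)) & 31);
}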


@@ -17,6 +17,9 @@
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
def riscv_rolw : SDNode<"RISCVISD::ROLW", SDTIntShiftOp>;
def riscv_rorw : SDNode<"RISCVISD::RORW", SDTIntShiftOp>;
def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
let Name = "UImmLog2XLenHalf";
let RenderMethod = "addImmOperands";
@@ -655,7 +658,6 @@ def SROIPat : ComplexPattern<XLenVT, 2, "SelectSROI", [or]>;
def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>;
def SLOIWPat : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>;
def SROIWPat : ComplexPattern<i64, 2, "SelectSROIW", [or]>;
def RORIWPat : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>;
let Predicates = [HasStdExtZbbOrZbp] in {
def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>;
@@ -724,17 +726,11 @@ def : Pat<(riscv_gorci GPR:$rs1, timm:$shamt), (GORCI GPR:$rs1, timm:$shamt)>;
let Predicates = [HasStdExtZbp, IsRV32] in {
def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>;
// FIXME: Is grev better than rori?
def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>;
def : Pat<(rotr GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>;
def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>;
def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>;
} // Predicates = [HasStdExtZbp, IsRV32]
let Predicates = [HasStdExtZbp, IsRV64] in {
// FIXME: Is grev better than rori?
def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>;
def : Pat<(rotr GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>;
def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>;
def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>;
} // Predicates = [HasStdExtZbp, IsRV64]
@@ -890,12 +886,14 @@ def : Pat<(not (riscv_srlw (not GPR:$rs1), GPR:$rs2)),
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def : Pat<(or (riscv_sllw GPR:$rs1, GPR:$rs2),
(riscv_srlw GPR:$rs1, (ineg GPR:$rs2))),
def : Pat<(riscv_rolw GPR:$rs1, GPR:$rs2),
(ROLW GPR:$rs1, GPR:$rs2)>;
def : Pat<(or (riscv_sllw GPR:$rs1, (ineg GPR:$rs2)),
(riscv_srlw GPR:$rs1, GPR:$rs2)),
def : Pat<(riscv_rorw GPR:$rs1, GPR:$rs2),
(RORW GPR:$rs1, GPR:$rs2)>;
def : Pat<(riscv_rorw GPR:$rs1, uimm5:$rs2),
(RORIW GPR:$rs1, uimm5:$rs2)>;
def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
(RORIW GPR:$rs1, (ImmROTL2RW uimm5:$rs2))>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
let Predicates = [HasStdExtZbs, IsRV64] in {
@@ -916,10 +914,6 @@ def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt),
(SROIW GPR:$rs1, uimmlog2xlen:$shamt)>;
} // Predicates = [HasStdExtZbb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt),
(RORIW GPR:$rs1, uimmlog2xlen:$shamt)>;
let Predicates = [HasStdExtZbp, IsRV64] in {
def : Pat<(riscv_greviw GPR:$rs1, timm:$shamt), (GREVIW GPR:$rs1, timm:$shamt)>;
def : Pat<(riscv_gorciw GPR:$rs1, timm:$shamt), (GORCIW GPR:$rs1, timm:$shamt)>;


@@ -1126,12 +1126,12 @@ define i32 @grev16_i32(i32 %a) nounwind {
;
; RV32IB-LABEL: grev16_i32:
; RV32IB: # %bb.0:
; RV32IB-NEXT: rev16 a0, a0
; RV32IB-NEXT: rori a0, a0, 16
; RV32IB-NEXT: ret
;
; RV32IBP-LABEL: grev16_i32:
; RV32IBP: # %bb.0:
; RV32IBP-NEXT: rev16 a0, a0
; RV32IBP-NEXT: rori a0, a0, 16
; RV32IBP-NEXT: ret
%shl = shl i32 %a, 16
%shr = lshr i32 %a, 16
@@ -1152,12 +1152,12 @@ define signext i32 @grev16_i32_fshl(i32 signext %a) nounwind {
;
; RV32IB-LABEL: grev16_i32_fshl:
; RV32IB: # %bb.0:
; RV32IB-NEXT: rev16 a0, a0
; RV32IB-NEXT: rori a0, a0, 16
; RV32IB-NEXT: ret
;
; RV32IBP-LABEL: grev16_i32_fshl:
; RV32IBP: # %bb.0:
; RV32IBP-NEXT: rev16 a0, a0
; RV32IBP-NEXT: rori a0, a0, 16
; RV32IBP-NEXT: ret
%or = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 16)
ret i32 %or
@@ -1173,12 +1173,12 @@ define signext i32 @grev16_i32_fshr(i32 signext %a) nounwind {
;
; RV32IB-LABEL: grev16_i32_fshr:
; RV32IB: # %bb.0:
; RV32IB-NEXT: rev16 a0, a0
; RV32IB-NEXT: rori a0, a0, 16
; RV32IB-NEXT: ret
;
; RV32IBP-LABEL: grev16_i32_fshr:
; RV32IBP: # %bb.0:
; RV32IBP-NEXT: rev16 a0, a0
; RV32IBP-NEXT: rori a0, a0, 16
; RV32IBP-NEXT: ret
%or = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 16)
ret i32 %or
@@ -1197,14 +1197,14 @@ define i64 @grev16_i64(i64 %a) nounwind {
;
; RV32IB-LABEL: grev16_i64:
; RV32IB: # %bb.0:
; RV32IB-NEXT: rev16 a0, a0
; RV32IB-NEXT: rev16 a1, a1
; RV32IB-NEXT: rori a0, a0, 16
; RV32IB-NEXT: rori a1, a1, 16
; RV32IB-NEXT: ret
;
; RV32IBP-LABEL: grev16_i64:
; RV32IBP: # %bb.0:
; RV32IBP-NEXT: rev16 a0, a0
; RV32IBP-NEXT: rev16 a1, a1
; RV32IBP-NEXT: rori a0, a0, 16
; RV32IBP-NEXT: rori a1, a1, 16
; RV32IBP-NEXT: ret
%and = shl i64 %a, 16
%shl = and i64 %and, -281470681808896


@@ -374,7 +374,6 @@ define signext i32 @rori_i32_fshl(i32 signext %a) nounwind {
}
; Similar to rori_i32_fshl, but doesn't sign extend the result.
; FIXME: We should be using RORIW, but we need a sext_inreg.
define void @rori_i32_fshl_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-LABEL: rori_i32_fshl_nosext:
; RV64I: # %bb.0:
@@ -386,25 +385,19 @@ define void @rori_i32_fshl_nosext(i32 signext %a, i32* %x) nounwind {
;
; RV64IB-LABEL: rori_i32_fshl_nosext:
; RV64IB: # %bb.0:
; RV64IB-NEXT: srliw a2, a0, 1
; RV64IB-NEXT: slli a0, a0, 31
; RV64IB-NEXT: or a0, a0, a2
; RV64IB-NEXT: roriw a0, a0, 1
; RV64IB-NEXT: sw a0, 0(a1)
; RV64IB-NEXT: ret
;
; RV64IBB-LABEL: rori_i32_fshl_nosext:
; RV64IBB: # %bb.0:
; RV64IBB-NEXT: srliw a2, a0, 1
; RV64IBB-NEXT: slli a0, a0, 31
; RV64IBB-NEXT: or a0, a0, a2
; RV64IBB-NEXT: roriw a0, a0, 1
; RV64IBB-NEXT: sw a0, 0(a1)
; RV64IBB-NEXT: ret
;
; RV64IBP-LABEL: rori_i32_fshl_nosext:
; RV64IBP: # %bb.0:
; RV64IBP-NEXT: srliw a2, a0, 1
; RV64IBP-NEXT: slli a0, a0, 31
; RV64IBP-NEXT: or a0, a0, a2
; RV64IBP-NEXT: roriw a0, a0, 1
; RV64IBP-NEXT: sw a0, 0(a1)
; RV64IBP-NEXT: ret
%1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 31)
@@ -440,7 +433,6 @@ define signext i32 @rori_i32_fshr(i32 signext %a) nounwind {
}
; Similar to rori_i32_fshr, but doesn't sign extend the result.
; FIXME: We should be using RORIW, but we need a sext_inreg.
define void @rori_i32_fshr_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-LABEL: rori_i32_fshr_nosext:
; RV64I: # %bb.0:
@@ -452,25 +444,19 @@ define void @rori_i32_fshr_nosext(i32 signext %a, i32* %x) nounwind {
;
; RV64IB-LABEL: rori_i32_fshr_nosext:
; RV64IB: # %bb.0:
; RV64IB-NEXT: slli a2, a0, 1
; RV64IB-NEXT: srliw a0, a0, 31
; RV64IB-NEXT: or a0, a0, a2
; RV64IB-NEXT: roriw a0, a0, 31
; RV64IB-NEXT: sw a0, 0(a1)
; RV64IB-NEXT: ret
;
; RV64IBB-LABEL: rori_i32_fshr_nosext:
; RV64IBB: # %bb.0:
; RV64IBB-NEXT: slli a2, a0, 1
; RV64IBB-NEXT: srliw a0, a0, 31
; RV64IBB-NEXT: or a0, a0, a2
; RV64IBB-NEXT: roriw a0, a0, 31
; RV64IBB-NEXT: sw a0, 0(a1)
; RV64IBB-NEXT: ret
;
; RV64IBP-LABEL: rori_i32_fshr_nosext:
; RV64IBP: # %bb.0:
; RV64IBP-NEXT: slli a2, a0, 1
; RV64IBP-NEXT: srliw a0, a0, 31
; RV64IBP-NEXT: or a0, a0, a2
; RV64IBP-NEXT: roriw a0, a0, 31
; RV64IBP-NEXT: sw a0, 0(a1)
; RV64IBP-NEXT: ret
%1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 31)


@@ -1377,12 +1377,12 @@ define i64 @grev32(i64 %a) nounwind {
;
; RV64IB-LABEL: grev32:
; RV64IB: # %bb.0:
; RV64IB-NEXT: rev32 a0, a0
; RV64IB-NEXT: rori a0, a0, 32
; RV64IB-NEXT: ret
;
; RV64IBP-LABEL: grev32:
; RV64IBP: # %bb.0:
; RV64IBP-NEXT: rev32 a0, a0
; RV64IBP-NEXT: rori a0, a0, 32
; RV64IBP-NEXT: ret
%shl = shl i64 %a, 32
%shr = lshr i64 %a, 32
@@ -1403,12 +1403,12 @@ define i64 @grev32_fshl(i64 %a) nounwind {
;
; RV64IB-LABEL: grev32_fshl:
; RV64IB: # %bb.0:
; RV64IB-NEXT: rev32 a0, a0
; RV64IB-NEXT: rori a0, a0, 32
; RV64IB-NEXT: ret
;
; RV64IBP-LABEL: grev32_fshl:
; RV64IBP: # %bb.0:
; RV64IBP-NEXT: rev32 a0, a0
; RV64IBP-NEXT: rori a0, a0, 32
; RV64IBP-NEXT: ret
%or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 32)
ret i64 %or
@@ -1424,12 +1424,12 @@ define i64 @grev32_fshr(i64 %a) nounwind {
;
; RV64IB-LABEL: grev32_fshr:
; RV64IB: # %bb.0:
; RV64IB-NEXT: rev32 a0, a0
; RV64IB-NEXT: rori a0, a0, 32
; RV64IB-NEXT: ret
;
; RV64IBP-LABEL: grev32_fshr:
; RV64IBP: # %bb.0:
; RV64IBP-NEXT: rev32 a0, a0
; RV64IBP-NEXT: rori a0, a0, 32
; RV64IBP-NEXT: ret
%or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 32)
ret i64 %or