
[InstCombine] matchFunnelShift - fold or(shl(a,x),lshr(b,sub(bw,x))) -> fshl(a,b,x) iff x < bw (REAPPLIED)

If value tracking can confirm that a shift amount is less than the type bitwidth, then we can more confidently fold general or(shl(a,x),lshr(b,sub(bw,x))) patterns to a funnel/rotate intrinsic pattern without causing bad codegen regressions in the backend (see D89139).

Reapplied after the shift canonicalization in rG02295e6d1a15, which removed the need to flip the shift values.

Differential Revision: https://reviews.llvm.org/D88783
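
For illustration, a minimal IR sketch of the kind of input this now handles (hypothetical function and value names; the "and" with 63 is what lets value tracking prove the shift amount is below the 64-bit width, mirroring the fshl_sub_mask test updated below):

define i64 @fshl_sub_mask_example(i64 %x, i64 %y, i64 %a) {
  %mask = and i64 %a, 63                ; shift amount provably < 64
  %shl = shl i64 %x, %mask
  %sub = sub nuw nsw i64 64, %mask
  %shr = lshr i64 %y, %sub
  %r = or i64 %shl, %shr
  ret i64 %r
}

With this patch, instcombine folds the body down to a single
  %r = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %a)
(the mask disappears as well, since funnel-shift amounts are already taken modulo the bitwidth).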
Simon Pilgrim 2020-10-12 16:06:41 +01:00
parent fbb71b81b0
commit 371ddf42dc
3 changed files with 21 additions and 35 deletions


@@ -2053,7 +2053,7 @@ Instruction *InstCombinerImpl::matchBSwap(BinaryOperator &Or) {
 }
 /// Match UB-safe variants of the funnel shift intrinsic.
-static Instruction *matchFunnelShift(Instruction &Or) {
+static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
   // TODO: Can we reduce the code duplication between this and the related
   // rotate matching code under visitSelect and visitTrunc?
   unsigned Width = Or.getType()->getScalarSizeInBits();
@@ -2100,6 +2100,16 @@ static Instruction *matchFunnelShift(Instruction &Or) {
     return L;
   }
+  // (shl ShVal, X) | (lshr ShVal, (Width - X)) iff X < Width.
+  // We limit this to X < Width in case the backend re-expands the intrinsic,
+  // and has to reintroduce a shift modulo operation (InstCombine might remove
+  // it after this fold). This still doesn't guarantee that the final codegen
+  // will match this original pattern.
+  if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L))))) {
+    KnownBits KnownL = IC.computeKnownBits(L, /*Depth*/ 0, &Or);
+    return KnownL.getMaxValue().ult(Width) ? L : nullptr;
+  }
   // For non-constant cases, the following patterns currently only work for
   // rotation patterns.
   // TODO: Add general funnel-shift compatible patterns.
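
(By way of contrast, a hypothetical input the new guard rejects: nothing below bounds %a, so computeKnownBits cannot prove the shift amount is less than 64 and the match above returns nullptr rather than forming a funnel shift:

  %shl = shl i64 %x, %a
  %sub = sub i64 64, %a
  %shr = lshr i64 %y, %sub
  %r = or i64 %shl, %shr

where %x, %y and %a are arbitrary i64 function arguments.)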
@@ -2593,7 +2603,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Instruction *BSwap = matchBSwap(I))
     return BSwap;
-  if (Instruction *Funnel = matchFunnelShift(I))
+  if (Instruction *Funnel = matchFunnelShift(I, *this))
     return Funnel;
   if (Instruction *Concat = matchOrConcat(I, Builder))


@@ -168,11 +168,7 @@ define <3 x i36> @fshl_v3i36_constant_nonsplat_undef0(<3 x i36> %x, <3 x i36> %y
 define i64 @fshl_sub_mask(i64 %x, i64 %y, i64 %a) {
 ; CHECK-LABEL: @fshl_sub_mask(
-; CHECK-NEXT: [[MASK:%.*]] = and i64 [[A:%.*]], 63
-; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[X:%.*]], [[MASK]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i64 64, [[MASK]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[Y:%.*]], [[SUB]]
-; CHECK-NEXT: [[R:%.*]] = or i64 [[SHL]], [[SHR]]
+; CHECK-NEXT: [[R:%.*]] = call i64 @llvm.fshl.i64(i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]])
 ; CHECK-NEXT: ret i64 [[R]]
 ;
   %mask = and i64 %a, 63
@@ -187,11 +183,7 @@ define i64 @fshl_sub_mask(i64 %x, i64 %y, i64 %a) {
 define i64 @fshr_sub_mask(i64 %x, i64 %y, i64 %a) {
 ; CHECK-LABEL: @fshr_sub_mask(
-; CHECK-NEXT: [[MASK:%.*]] = and i64 [[A:%.*]], 63
-; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[X:%.*]], [[MASK]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i64 64, [[MASK]]
-; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[Y:%.*]], [[SUB]]
-; CHECK-NEXT: [[R:%.*]] = or i64 [[SHL]], [[SHR]]
+; CHECK-NEXT: [[R:%.*]] = call i64 @llvm.fshr.i64(i64 [[Y:%.*]], i64 [[X:%.*]], i64 [[A:%.*]])
 ; CHECK-NEXT: ret i64 [[R]]
 ;
   %mask = and i64 %a, 63
@@ -204,11 +196,7 @@ define i64 @fshr_sub_mask(i64 %x, i64 %y, i64 %a) {
 define <2 x i64> @fshr_sub_mask_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %a) {
 ; CHECK-LABEL: @fshr_sub_mask_vector(
-; CHECK-NEXT: [[MASK:%.*]] = and <2 x i64> [[A:%.*]], <i64 63, i64 63>
-; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i64> [[X:%.*]], [[MASK]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw <2 x i64> <i64 64, i64 64>, [[MASK]]
-; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i64> [[Y:%.*]], [[SUB]]
-; CHECK-NEXT: [[R:%.*]] = or <2 x i64> [[SHL]], [[SHR]]
+; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> [[Y:%.*]], <2 x i64> [[X:%.*]], <2 x i64> [[A:%.*]])
 ; CHECK-NEXT: ret <2 x i64> [[R]]
 ;
   %mask = and <2 x i64> %a, <i64 63, i64 63>


@@ -676,12 +676,8 @@ define i9 @rotateleft_9_neg_mask_wide_amount_commute(i9 %v, i33 %shamt) {
 define i64 @rotl_sub_mask(i64 %0, i64 %1) {
 ; CHECK-LABEL: @rotl_sub_mask(
-; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP1:%.*]], 63
-; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP0:%.*]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i64 64, [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP0]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[TMP4]]
-; CHECK-NEXT: ret i64 [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.fshl.i64(i64 [[TMP0:%.*]], i64 [[TMP0]], i64 [[TMP1:%.*]])
+; CHECK-NEXT: ret i64 [[TMP3]]
 ;
   %3 = and i64 %1, 63
   %4 = shl i64 %0, %3
@@ -695,12 +691,8 @@ define i64 @rotl_sub_mask(i64 %0, i64 %1) {
 define i64 @rotr_sub_mask(i64 %0, i64 %1) {
 ; CHECK-LABEL: @rotr_sub_mask(
-; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP1:%.*]], 63
-; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP0:%.*]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i64 64, [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP0]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[TMP4]]
-; CHECK-NEXT: ret i64 [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.fshr.i64(i64 [[TMP0:%.*]], i64 [[TMP0]], i64 [[TMP1:%.*]])
+; CHECK-NEXT: ret i64 [[TMP3]]
 ;
   %3 = and i64 %1, 63
   %4 = lshr i64 %0, %3
@@ -712,12 +704,8 @@ define i64 @rotr_sub_mask(i64 %0, i64 %1) {
 define <2 x i64> @rotr_sub_mask_vector(<2 x i64> %0, <2 x i64> %1) {
 ; CHECK-LABEL: @rotr_sub_mask_vector(
-; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP1:%.*]], <i64 63, i64 63>
-; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP0:%.*]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <2 x i64> <i64 64, i64 64>, [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP0]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <2 x i64> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: ret <2 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> [[TMP0:%.*]], <2 x i64> [[TMP0]], <2 x i64> [[TMP1:%.*]])
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
 ;
   %3 = and <2 x i64> %1, <i64 63, i64 63>
   %4 = lshr <2 x i64> %0, %3