[Hexagon] Restrict compound instructions with constant value.

Having a constant operand in a compound instruction is not always profitable, because the constant may first have to be transferred into a register. This patch improves CoreMark by ~4% on Hexagon.

Differential Revision: https://reviews.llvm.org/D53152
llvm-svn: 344284
2025-01-31 20:51:52 +01:00 · 2018-10-11 19:48:15 +00:00 · 2018-10-11 19:48:15 +00:00 · dbfd952aff
commit dbfd952aff
parent 5658ce69fb
2 changed files with 79 additions and 10 deletions
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@ -257,6 +257,23 @@ class pf2<SDNode Op> : PatFrag<(ops node:$a, node:$b), (Op node:$a, node:$b)>;
 class Not2<PatFrag P>
  : PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;

+// Like Su (single-use fragment), but additionally rejects the match when
+// operand 1 of the and/or node is a constant.
+// A compound instruction needs that constant in a register, so it is not
+// always profitable: sometimes we end up with an extra transfer, e.g.
+//   ra = #65280; rb = lsr(rb, #8); rc ^= and (rb, ra)
+// whereas without the compound instruction we get the preferable
+//   ra = and (#65280, lsr(ra, #8)); rb = xor(rb, ra)
+class Su_ni1<PatFrag Op>
+  : PatFrag<Op.Operands, !head(Op.Fragments), [{
+            // Only fold single-use nodes, as Su does.
+            if (!hasOneUse(N))
+              return false;
+            // Reject the fragment when Op1 is an immediate operand.
+            return !isa<ConstantSDNode>(N->getOperand(1));
+            }],
+            Op.OperandTransform>;
+
 class Su<PatFrag Op>
  : PatFrag<Op.Operands, !head(Op.Fragments), [{ return hasOneUse(N); }],
            Op.OperandTransform>;
@ -1336,16 +1353,16 @@ def: Pat<(mul I32:$Rs, n8_0ImmPred:$n8),
 def: Pat<(add Sext64:$Rs, I64:$Rt),
         (A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>;

-def: AccRRR_pat<M4_and_and,   And, Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_and_or,    And, Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_and,    Or,  Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_or,     Or,  Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_and,   Xor, Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_or,    Xor, Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,       I64,  I64,  I64>;
+def: AccRRR_pat<M4_and_and,   And, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_or,    And, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_and,    Or,  Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_or,     Or,  Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_and,   Xor, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_or,    Xor, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,      I64,  I64,  I64>;

 // For dags like (or (and (not _), _), (shl _, _)) where the "or" with
 // one argument matches the patterns below, and with the other argument
--- a/test/CodeGen/Hexagon/constant_compound.ll
+++ b/test/CodeGen/Hexagon/constant_compound.ll
@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon < %s 2>&1 | FileCheck %s
+
+; Generating a compound instruction with a constant operand is not profitable:
+; the constant has to be placed in a register before it can be fed to the
+; compound instruction.
+; Before, we generated
+; ra = #65280;
+; rb = lsr(rb, #8);
+; rc ^= and (rb, ra)
+; Now, we generate
+; ra = and (#65280, lsr(ra, #8));
+; rb = xor(rb, ra)
+
+; CHECK: and(##65280,lsr(r
+; CHECK-NOT: ^= and
+
+define dso_local zeroext i16 @test_compound(i16 zeroext %varA, i16 zeroext %varB) local_unnamed_addr #0 {
+entry:
+  %tmp = zext i16 %varB to i32
+  %tmp1 = and i16 %varA, 255
+  %tmp2 = zext i16 %tmp1 to i32
+  %.masked.i = and i32 %tmp, 255
+  %tmp3 = xor i32 %.masked.i, %tmp2
+  %tmp4 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp3, i32 255) #2
+  %tmp5 = trunc i64 %tmp4 to i32
+  %tmp6 = and i32 %tmp5, 255
+  %tmp7 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp6, i32 81922) #2
+  %tmp8 = trunc i64 %tmp7 to i32
+  %tmp9 = xor i32 %tmp8, %tmp
+  %tmp10 = lshr i32 %tmp9, 8 ; shift whose result is masked with a constant below
+  %tmp11 = lshr i16 %varA, 8
+  %conv2 = zext i16 %tmp11 to i32
+  %tmp12 = and i32 %tmp10, 65280 ; and with a constant operand, used only by the xor at %tmp19
+  %.masked.i7 = and i32 %tmp10, 255
+  %tmp13 = xor i32 %.masked.i7, %conv2
+  %tmp14 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp13, i32 255) #2
+  %tmp15 = trunc i64 %tmp14 to i32
+  %tmp16 = and i32 %tmp15, 255
+  %tmp17 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp16, i32 81922) #2
+  %tmp18 = trunc i64 %tmp17 to i32
+  %tmp19 = xor i32 %tmp12, %tmp18 ; xor fed by the constant-masked and (%tmp12)
+  %tmp20 = lshr i32 %tmp19, 8
+  %tmp21 = trunc i32 %tmp20 to i16
+  ret i16 %tmp21
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M4.pmpyw(i32, i32) #1
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv65" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }