
[AARCH64][NEON] Allow sinking the operands of aarch64_neon_pmull64.

Summary:
This patch fixes a problem where the pmull2 instruction was not
generated for the vmull_high_p64 intrinsic.

ISel has a pattern for the int_aarch64_neon_pmull64 intrinsic that generates
the PMULL2 instruction. That pattern assumes the extraction operations feeding
the intrinsic are located in the same basic block as the call; if they are not,
we need to sink them. Handle the operands of int_aarch64_neon_pmull64
in AArch64TargetLowering::shouldSinkOperands.
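
As a minimal illustration (a hand-written sketch, not taken from the patch;
the function and value names below are made up), the problematic shape is IR
where the lane-1 extracts are defined in one block while the intrinsic call
lives in another:

  define <16 x i8> @sketch(<2 x i64> %x, <2 x i64> %h, i1 %c) {
  entry:
    ; The lane-1 extracts are defined here ...
    %xhi = extractelement <2 x i64> %x, i32 1
    %hhi = extractelement <2 x i64> %h, i32 1
    br i1 %c, label %use, label %skip

  use:
    ; ... but used here, in a different basic block, so the PMULL2 ISel
    ; pattern could not match them before this patch.
    %p = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %xhi, i64 %hhi)
    ret <16 x i8> %p

  skip:
    ret <16 x i8> zeroinitializer
  }

  declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

With this change, shouldSinkOperands reports both extracts as sinkable, so
CodeGenPrepare can sink them into the block of the call and ISel can select
pmull2.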

Reviewed by: efriedma

Differential Revision: https://reviews.llvm.org/D80320
Alexey Lapshin 2020-05-20 21:45:39 +03:00
parent b8040080d8
commit 073a1e428c
2 changed files with 122 additions and 0 deletions


@@ -9398,6 +9398,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
  return true;
}

/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {
  Value *VectorOperand = nullptr;
  ConstantInt *ElementIndex = nullptr;
  return match(Op, m_ExtractElement(m_Value(VectorOperand),
                                    m_ConstantInt(ElementIndex))) &&
         ElementIndex->getValue() == 1 &&
         isa<FixedVectorType>(VectorOperand->getType()) &&
         cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
}

/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -9414,6 +9430,15 @@ bool AArch64TargetLowering::shouldSinkOperands(
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;

    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
                                     II->getArgOperand(1)))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      Ops.push_back(&II->getArgOperandUse(1));
      return true;

    default:
      return false;
    }


@@ -0,0 +1,97 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; This test checks that the pmull2 instruction is used for the vmull_high_p64 intrinsic.
; There are two extraction operations located in different basic blocks:
;
; %4 = extractelement <2 x i64> %0, i32 1
; %12 = extractelement <2 x i64> %9, i32 1
;
; They are used by:
;
; @llvm.aarch64.neon.pmull64(i64 %12, i64 %4) #2
;
; We test that the pattern replacing llvm.aarch64.neon.pmull64 with pmull2
; is applied.
; The IR for this test was generated from the following .cpp file:
;
; #include <arm_neon.h>
;
; struct SS {
; uint64x2_t x, h;
; };
;
; void func (SS *g, unsigned int count, const unsigned char *buf, poly128_t* res )
; {
; const uint64x2_t x = g->x;
; const uint64x2_t h = g->h;
; uint64x2_t ci = g->x;
;
; for (int i = 0; i < count; i+=2, buf += 16) {
; ci = vreinterpretq_u64_u8(veorq_u8(vreinterpretq_u8_u64(ci),
; vrbitq_u8(vld1q_u8(buf))));
; res[i] = vmull_p64((poly64_t)vget_low_p64(vreinterpretq_p64_u64(ci)),
; (poly64_t)vget_low_p64(vreinterpretq_p64_u64(h)));
; res[i+1] = vmull_high_p64(vreinterpretq_p64_u64(ci),
; vreinterpretq_p64_u64(h));
; }
; }
; CHECK-LABEL: _Z4funcP2SSjPKhPo:
; CHECK: pmull2
%struct.SS = type { <2 x i64>, <2 x i64> }
; Function Attrs: nofree noinline nounwind
define dso_local void @_Z4funcP2SSjPKhPo(%struct.SS* nocapture readonly %g, i32 %count, i8* nocapture readonly %buf, i128* nocapture %res) local_unnamed_addr #0 {
entry:
%h2 = getelementptr inbounds %struct.SS, %struct.SS* %g, i64 0, i32 1
%0 = load <2 x i64>, <2 x i64>* %h2, align 16
%cmp34 = icmp eq i32 %count, 0
br i1 %cmp34, label %for.cond.cleanup, label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
%1 = bitcast %struct.SS* %g to <16 x i8>*
%2 = load <16 x i8>, <16 x i8>* %1, align 16
%3 = extractelement <2 x i64> %0, i32 0
%4 = extractelement <2 x i64> %0, i32 1
%5 = zext i32 %count to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%buf.addr.036 = phi i8* [ %buf, %for.body.lr.ph ], [ %add.ptr, %for.body ]
%6 = phi <16 x i8> [ %2, %for.body.lr.ph ], [ %xor.i, %for.body ]
%7 = bitcast i8* %buf.addr.036 to <16 x i8>*
%8 = load <16 x i8>, <16 x i8>* %7, align 16
%vrbit.i = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %8) #0
%xor.i = xor <16 x i8> %vrbit.i, %6
%9 = bitcast <16 x i8> %xor.i to <2 x i64>
%10 = extractelement <2 x i64> %9, i32 0
%vmull_p64.i = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %10, i64 %3) #0
%arrayidx = getelementptr inbounds i128, i128* %res, i64 %indvars.iv
%11 = bitcast i128* %arrayidx to <16 x i8>*
store <16 x i8> %vmull_p64.i, <16 x i8>* %11, align 16
%12 = extractelement <2 x i64> %9, i32 1
%vmull_p64.i.i = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %12, i64 %4) #0
%13 = or i64 %indvars.iv, 1
%arrayidx16 = getelementptr inbounds i128, i128* %res, i64 %13
%14 = bitcast i128* %arrayidx16 to <16 x i8>*
store <16 x i8> %vmull_p64.i.i, <16 x i8>* %14, align 16
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%add.ptr = getelementptr inbounds i8, i8* %buf.addr.036, i64 16
%cmp = icmp ult i64 %indvars.iv.next, %5
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #0
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #0
attributes #0 = { nofree noinline nounwind }