
[AARCH64][NEON] Allow sinking the operands of aarch64_neon_pmull64.

Summary:
This patch fixes a problem where the pmull2 instruction was not
generated for the vmull_high_p64 intrinsic.

ISel has a pattern for the int_aarch64_neon_pmull64 intrinsic that generates
the PMULL2 instruction. That pattern assumes the extraction operations feeding
the intrinsic are located in the same basic block as the call; if they are not,
we need to sink them. Handle the operands of int_aarch64_neon_pmull64
in AArch64TargetLowering::shouldSinkOperands.
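
As a minimal illustration (a hand-written sketch, not taken from the patch;
the function and value names below are made up), the problematic shape is IR
where the lane-1 extracts are defined in one block while the intrinsic call
lives in another:

  define <16 x i8> @sketch(<2 x i64> %x, <2 x i64> %h, i1 %c) {
  entry:
    ; The lane-1 extracts are defined here ...
    %xhi = extractelement <2 x i64> %x, i32 1
    %hhi = extractelement <2 x i64> %h, i32 1
    br i1 %c, label %use, label %skip

  use:
    ; ... but used here, in a different basic block, so the PMULL2 ISel
    ; pattern could not match them before this patch.
    %p = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %xhi, i64 %hhi)
    ret <16 x i8> %p

  skip:
    ret <16 x i8> zeroinitializer
  }

  declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

With this change, shouldSinkOperands reports both extracts as sinkable, so
CodeGenPrepare can sink them into the block of the call and ISel can select
pmull2.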

Reviewed by: efriedma

Differential Revision: https://reviews.llvm.org/D80320
Alexey Lapshin 2020-05-20 21:45:39 +03:00
parent b8040080d8
commit 073a1e428c
2 changed files with 122 additions and 0 deletions


@@ -9398,6 +9398,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
  return true;
}

/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {
  Value *VectorOperand = nullptr;
  ConstantInt *ElementIndex = nullptr;
  return match(Op, m_ExtractElement(m_Value(VectorOperand),
                                    m_ConstantInt(ElementIndex))) &&
         ElementIndex->getValue() == 1 &&
         isa<FixedVectorType>(VectorOperand->getType()) &&
         cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
}

/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -9414,6 +9430,15 @@ bool AArch64TargetLowering::shouldSinkOperands(
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;

    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
                                     II->getArgOperand(1)))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      Ops.push_back(&II->getArgOperandUse(1));
      return true;

    default:
      return false;
    }


@@ -0,0 +1,97 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; This test checks that the pmull2 instruction is used for the vmull_high_p64 intrinsic.
; There are two extraction operations located in different basic blocks:
;
; %4 = extractelement <2 x i64> %0, i32 1
; %12 = extractelement <2 x i64> %9, i32 1
;
; They are used by:
;
; @llvm.aarch64.neon.pmull64(i64 %12, i64 %4) #2
;
; We test that the pattern replacing llvm.aarch64.neon.pmull64 with pmull2
; is applied.
; The IR for this test was generated from the following .cpp file:
;
; #include <arm_neon.h>
;
; struct SS {
; uint64x2_t x, h;
; };
;
; void func (SS *g, unsigned int count, const unsigned char *buf, poly128_t* res )
; {
; const uint64x2_t x = g->x;
; const uint64x2_t h = g->h;
; uint64x2_t ci = g->x;
;
; for (int i = 0; i < count; i+=2, buf += 16) {
; ci = vreinterpretq_u64_u8(veorq_u8(vreinterpretq_u8_u64(ci),
; vrbitq_u8(vld1q_u8(buf))));
; res[i] = vmull_p64((poly64_t)vget_low_p64(vreinterpretq_p64_u64(ci)),
; (poly64_t)vget_low_p64(vreinterpretq_p64_u64(h)));
; res[i+1] = vmull_high_p64(vreinterpretq_p64_u64(ci),
; vreinterpretq_p64_u64(h));
; }
; }
; CHECK-LABEL: _Z4funcP2SSjPKhPo:
; CHECK: pmull2
%struct.SS = type { <2 x i64>, <2 x i64> }
; Function Attrs: nofree noinline nounwind
define dso_local void @_Z4funcP2SSjPKhPo(%struct.SS* nocapture readonly %g, i32 %count, i8* nocapture readonly %buf, i128* nocapture %res) local_unnamed_addr #0 {
entry:
%h2 = getelementptr inbounds %struct.SS, %struct.SS* %g, i64 0, i32 1
%0 = load <2 x i64>, <2 x i64>* %h2, align 16
%cmp34 = icmp eq i32 %count, 0
br i1 %cmp34, label %for.cond.cleanup, label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
%1 = bitcast %struct.SS* %g to <16 x i8>*
%2 = load <16 x i8>, <16 x i8>* %1, align 16
%3 = extractelement <2 x i64> %0, i32 0
%4 = extractelement <2 x i64> %0, i32 1
%5 = zext i32 %count to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%buf.addr.036 = phi i8* [ %buf, %for.body.lr.ph ], [ %add.ptr, %for.body ]
%6 = phi <16 x i8> [ %2, %for.body.lr.ph ], [ %xor.i, %for.body ]
%7 = bitcast i8* %buf.addr.036 to <16 x i8>*
%8 = load <16 x i8>, <16 x i8>* %7, align 16
%vrbit.i = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %8) #0
%xor.i = xor <16 x i8> %vrbit.i, %6
%9 = bitcast <16 x i8> %xor.i to <2 x i64>
%10 = extractelement <2 x i64> %9, i32 0
%vmull_p64.i = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %10, i64 %3) #0
%arrayidx = getelementptr inbounds i128, i128* %res, i64 %indvars.iv
%11 = bitcast i128* %arrayidx to <16 x i8>*
store <16 x i8> %vmull_p64.i, <16 x i8>* %11, align 16
%12 = extractelement <2 x i64> %9, i32 1
%vmull_p64.i.i = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %12, i64 %4) #0
%13 = or i64 %indvars.iv, 1
%arrayidx16 = getelementptr inbounds i128, i128* %res, i64 %13
%14 = bitcast i128* %arrayidx16 to <16 x i8>*
store <16 x i8> %vmull_p64.i.i, <16 x i8>* %14, align 16
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%add.ptr = getelementptr inbounds i8, i8* %buf.addr.036, i64 16
%cmp = icmp ult i64 %indvars.iv.next, %5
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #0
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #0
attributes #0 = { nofree noinline nounwind }