1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00
llvm-mirror/test/CodeGen/AArch64/neon-vmull-high-p64.ll
Alexey Lapshin 073a1e428c [AARCH64][NEON] Allow to sink operands of aarch64_neon_pmull64.
Summary:
This patch fixes a problem when pmull2 instruction is not
generated for vmull_high_p64 intrinsic.

ISel has a pattern for int_aarch64_neon_pmull64 intrinsic to generate
PMULL2 instruction. That pattern assumes that extraction operations
are located in the same basic block. We need to sink them
if they are not. Handle operands of int_aarch64_neon_pmull64
into AArch64TargetLowering::shouldSinkOperands.

Reviewed by: efriedma

Differential Revision: https://reviews.llvm.org/D80320
2020-05-22 01:35:24 +03:00

98 lines
3.7 KiB
LLVM

; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; This test checks that pmull2 instruction is used for vmull_high_p64 intrinsic.
; There are two extraction operations located in different basic blocks:
;
; %4 = extractelement <2 x i64> %0, i32 1
; %12 = extractelement <2 x i64> %9, i32 1
;
; They are used by:
;
; @llvm.aarch64.neon.pmull64(i64 %12, i64 %4) #2
;
; We test that pattern replacing llvm.aarch64.neon.pmull64 with pmull2
; would be applied.
; IR for that test was generated from the following .cpp file:
;
; #include <arm_neon.h>
;
; struct SS {
; uint64x2_t x, h;
; };
;
; void func (SS *g, unsigned int count, const unsigned char *buf, poly128_t* res )
; {
; const uint64x2_t x = g->x;
; const uint64x2_t h = g->h;
; uint64x2_t ci = g->x;
;
; for (int i = 0; i < count; i+=2, buf += 16) {
; ci = vreinterpretq_u64_u8(veorq_u8(vreinterpretq_u8_u64(ci),
; vrbitq_u8(vld1q_u8(buf))));
; res[i] = vmull_p64((poly64_t)vget_low_p64(vreinterpretq_p64_u64(ci)),
; (poly64_t)vget_low_p64(vreinterpretq_p64_u64(h)));
; res[i+1] = vmull_high_p64(vreinterpretq_p64_u64(ci),
; vreinterpretq_p64_u64(h));
; }
; }
;CHECK_LABEL: func:
;CHECK: pmull2
%struct.SS = type { <2 x i64>, <2 x i64> }
; Function Attrs: nofree noinline nounwind
define dso_local void @_Z4funcP2SSjPKhPo(%struct.SS* nocapture readonly %g, i32 %count, i8* nocapture readonly %buf, i128* nocapture %res) local_unnamed_addr #0 {
entry:
%h2 = getelementptr inbounds %struct.SS, %struct.SS* %g, i64 0, i32 1
%0 = load <2 x i64>, <2 x i64>* %h2, align 16
%cmp34 = icmp eq i32 %count, 0
br i1 %cmp34, label %for.cond.cleanup, label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
%1 = bitcast %struct.SS* %g to <16 x i8>*
%2 = load <16 x i8>, <16 x i8>* %1, align 16
%3 = extractelement <2 x i64> %0, i32 0
%4 = extractelement <2 x i64> %0, i32 1
%5 = zext i32 %count to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%buf.addr.036 = phi i8* [ %buf, %for.body.lr.ph ], [ %add.ptr, %for.body ]
%6 = phi <16 x i8> [ %2, %for.body.lr.ph ], [ %xor.i, %for.body ]
%7 = bitcast i8* %buf.addr.036 to <16 x i8>*
%8 = load <16 x i8>, <16 x i8>* %7, align 16
%vrbit.i = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %8) #0
%xor.i = xor <16 x i8> %vrbit.i, %6
%9 = bitcast <16 x i8> %xor.i to <2 x i64>
%10 = extractelement <2 x i64> %9, i32 0
%vmull_p64.i = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %10, i64 %3) #0
%arrayidx = getelementptr inbounds i128, i128* %res, i64 %indvars.iv
%11 = bitcast i128* %arrayidx to <16 x i8>*
store <16 x i8> %vmull_p64.i, <16 x i8>* %11, align 16
%12 = extractelement <2 x i64> %9, i32 1
%vmull_p64.i.i = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %12, i64 %4) #0
%13 = or i64 %indvars.iv, 1
%arrayidx16 = getelementptr inbounds i128, i128* %res, i64 %13
%14 = bitcast i128* %arrayidx16 to <16 x i8>*
store <16 x i8> %vmull_p64.i.i, <16 x i8>* %14, align 16
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%add.ptr = getelementptr inbounds i8, i8* %buf.addr.036, i64 16
%cmp = icmp ult i64 %indvars.iv.next, %5
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #0
; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #0
attributes #0 = { nofree noinline nounwind }