
[AArch64] Add IR intrinsics for sq(r)dmulh_lane(q)

Summary:
Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h)
are represented in LLVM IR as a (by-vector) sqdmulh and a shufflevector with
(repeated) indices, like so:

   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)

When %v's values are known, the shufflevector is optimized away and we are no
longer able to select the lane variant of sqdmulh in the backend.
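
For illustration, with a known %v the IR above folds to something like this
(the constant value is made up for the example); no shuffle is left for the
backend to match into the lane form:

   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> <i16 9, i16 9, i16 9, i16 9>)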

This defeats a (hand-coded) optimization that packs several constants into a
single vector and uses the lane intrinsics to reduce register pressure, trading
the materialisation of several constants for a single vector load from the
constant pool, like so:

   int16x8_t v = {2,3,4,5,6,7,8,9};
   a = vqdmulh_laneq_s16(a, v, 0);
   b = vqdmulh_laneq_s16(b, v, 1);
   c = vqdmulh_laneq_s16(c, v, 2);
   d = vqdmulh_laneq_s16(d, v, 3);
   [...]

In one microbenchmark from libjpeg-turbo this accounts for a 2.5% to 4%
performance difference.

We could teach the compiler to recover the lane variants, but this would likely
require its own pass.  (Alternatively, "volatile" could be used on the constants
vector, but this is a bit ugly.)

This patch instead implements the following LLVM IR intrinsics for AArch64 to
maintain the original structure through IR optimization and into instruction
selection:
- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq
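
With these, the lane index and the unshuffled vector survive IR optimization
intact, e.g. (as exercised by the new tests below):

   %r = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)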

These 'lane' variants need an additional register class: the second argument
must be in the lower half of the 64-bit NEON register file, but only when
operating on i16 elements, since the encoding then has only four bits for the
Rm field (registers 0-15).

Note that the existing patterns that match a shufflevector plus sqdmulh to
sqdmulh_lane (etc.) remain, so code that does not rely on NEON intrinsics to
generate these instructions is not affected.

This patch also changes clang to emit these IR intrinsics for the corresponding
NEON intrinsics (AArch64 only).
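
As a rough sketch of that mapping (the exact IR clang emits may differ in
detail), a call such as:

   int16x4_t r = vqdmulh_laneq_s16(a, v, 3);

now becomes a direct call to the laneq intrinsic instead of a shufflevector
plus a plain sqdmulh:

   %r = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)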

Reviewers: SjoerdMeijer, dmgreen, t.p.northover, rovka, rengolin, efriedma

Reviewed By: efriedma

Subscribers: kristof.beyls, hiraditya, jdoerfert, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71469
Author: Sanne Wouda
Date:   2020-01-29 13:07:15 +00:00
Parent: d34c0f6368
Commit: 5c00a1f121

8 changed files with 351 additions and 2 deletions


@@ -133,6 +133,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
      : Intrinsic<[llvm_anyvector_ty],
                  [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
                  [IntrNoMem]>;
  class AdvSIMD_2VectorArg_Lane_Intrinsic
    : Intrinsic<[llvm_anyint_ty],
                [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
                [IntrNoMem]>;
  class AdvSIMD_3VectorArg_Intrinsic
      : Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +211,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
  // Vector Saturating Doubling Multiply High
  def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
  def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
  def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;

  // Vector Saturating Rounding Doubling Multiply High
  def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
  def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
  def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;

  // Vector Polynominal Multiply
  def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;


@@ -360,6 +360,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;

def UImmS1XForm : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -7968,6 +7971,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
  }
}

multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
                                 SDPatternOperator OpNodeLaneQ> {

  def : Pat<(v4i16 (OpNodeLane
                     (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v4i16 (OpNodeLaneQ
                     (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
                     VectorIndexH32b:$idx)),
            (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

  def : Pat<(v8i16 (OpNodeLane
                     (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), $Rm, dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v8i16 (OpNodeLaneQ
                     (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
                     VectorIndexH32b:$idx)),
            (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

  def : Pat<(v2i32 (OpNodeLane
                     (v2i32 V64:$Rn), (v2i32 V64:$Rm),
                     VectorIndexD32b:$idx)),
            (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v2i32 (OpNodeLaneQ
                     (v2i32 V64:$Rn), (v4i32 V128:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

  def : Pat<(v4i32 (OpNodeLane
                     (v4i32 V128:$Rn), (v2i32 V64:$Rm),
                     VectorIndexD32b:$idx)),
            (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
              (SUBREG_TO_REG (i32 0), $Rm, dsub),
              (UImmS1XForm $idx))>;

  def : Pat<(v4i32 (OpNodeLaneQ
                     (v4i32 V128:$Rn),
                     (v4i32 V128:$Rm),
                     VectorIndexS32b:$idx)),
            (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
              (UImmS1XForm $idx))>;

}

multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
                         SDPatternOperator OpNode> {
  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,


@@ -5631,6 +5631,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;

defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
                                     int_aarch64_neon_sqdmulh_laneq>;
defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
                                      int_aarch64_neon_sqrdmulh_laneq>;

// Generated by MachineCombine
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;


@@ -230,6 +230,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
  case AArch64::FPR16RegClassID:
  case AArch64::FPR32RegClassID:
  case AArch64::FPR64RegClassID:
  case AArch64::FPR64_loRegClassID:
  case AArch64::FPR128RegClassID:
  case AArch64::FPR128_loRegClassID:
  case AArch64::DDRegClassID:


@@ -596,6 +596,7 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
    return 32;
  case AArch64::FPR128_loRegClassID:
  case AArch64::FPR64_loRegClassID:
    return 16;
  }
}


@@ -429,6 +429,10 @@ def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
                                      v1i64, v4f16],
                          64, (sequence "D%u", 0, 31)>;
def FPR64_lo : RegisterClass<"AArch64",
                             [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
                             64, (trunc FPR64, 16)>;

// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
@@ -503,6 +507,9 @@ def VectorRegLoAsmOperand : AsmOperandClass {
  let Name = "VectorRegLo";
  let PredicateMethod = "isNeonVectorRegLo";
}
def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
  let ParserMatchClass = VectorRegLoAsmOperand;
}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
  let ParserMatchClass = VectorRegLoAsmOperand;
}


@@ -1033,8 +1033,10 @@ public:
  bool isNeonVectorRegLo() const {
    return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
-          AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
-              Reg.RegNum);
+          (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+               Reg.RegNum) ||
+           AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+               Reg.RegNum));
  }
template <unsigned Class> bool isSVEVectorReg() const {


@@ -9,20 +9,36 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
@@ -1515,6 +1531,37 @@ entry:
ret <4 x i16> %vqdmulh2.i
}
define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
ret <4 x i16> %vqdmulh2.i
}
define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
ret <4 x i16> %vqdmulh2.i
}
define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
ret <4 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1527,6 +1574,37 @@ entry:
ret <8 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
ret <8 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
ret <8 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
ret <8 x i16> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1539,6 +1617,37 @@ entry:
ret <2 x i32> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
ret <2 x i32> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
ret <2 x i32> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
ret <2 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1551,6 +1660,37 @@ entry:
ret <4 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1563,6 +1703,37 @@ entry:
ret <4 x i16> %vqrdmulh2.i
}
define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
ret <4 x i16> %vqrdmulh2.i
}
define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
ret <4 x i16> %vqrdmulh2.i
}
define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
ret <4 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1575,6 +1746,37 @@ entry:
ret <8 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
ret <8 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
ret <8 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
ret <8 x i16> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1587,6 +1789,37 @@ entry:
ret <2 x i32> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
ret <2 x i32> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
ret <2 x i32> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
ret <2 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1599,6 +1832,37 @@ entry:
ret <4 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
ret <4 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
ret <4 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
ret <4 x i32> %vqrdmulh2.i
}
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: // %bb.0: // %entry