1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-24 03:33:20 +01:00

Change register allocation order for ARM VFP and NEON registers to put the

callee-saved registers at the end of the lists.  Also prefer to avoid using
the low registers that are in register subclasses required by certain
instructions, so that those registers will more likely be available when needed.
This change makes a huge improvement in spilling in some cases.  Thanks to
Jakob for helping me realize the problem.

Most of this patch is fixing the testsuite.  There are quite a few places
where we're checking for specific registers.  I changed those to wildcards
in places where that doesn't weaken the tests.  The spill-q.ll and
thumb2-spill-q.ll tests stopped spilling with this change, so I added a bunch
of live values to force spills on those tests.

llvm-svn: 116055
This commit is contained in:
Bob Wilson 2010-10-08 06:15:13 +00:00
parent 761f40f7ee
commit 8689a52c10
20 changed files with 235 additions and 111 deletions

View File

@ -387,16 +387,18 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
// VFP3
// VFP3: D8-D15 are callee saved and should be allocated last.
// Save other low registers for use as DPR_VFP2 and DPR_8 classes.
static const unsigned ARM_DPR_VFP3[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
ARM::D28, ARM::D29, ARM::D30, ARM::D31,
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
DPRClass::iterator
DPRClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
@ -438,6 +440,29 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
let SubRegClasses = [(DPR dsub_0, dsub_1)];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// Q4-Q7 are callee saved and should be allocated last.
// Save other low registers for use as QPR_VFP2 and QPR_8 classes.
static const unsigned ARM_QPR[] = {
ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11,
ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15,
ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7 };
QPRClass::iterator
QPRClass::allocation_order_begin(const MachineFunction &MF) const {
return ARM_QPR;
}
QPRClass::iterator
QPRClass::allocation_order_end(const MachineFunction &MF) const {
return ARM_QPR + (sizeof(ARM_QPR)/sizeof(unsigned));
}
}];
}
// Subset of QPR that have 32-bit SPR subregs.
@ -463,6 +488,27 @@ def QQPR : RegisterClass<"ARM", [v4i64],
[QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> {
let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
(QPR qsub_0, qsub_1)];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// QQ2-QQ3 are callee saved and should be allocated last.
// Save other low registers for use as QPR_VFP2 and QPR_8 classes.
static const unsigned ARM_QQPR[] = {
ARM::QQ4, ARM::QQ5, ARM::QQ6, ARM::QQ7,
ARM::QQ0, ARM::QQ1, ARM::QQ2, ARM::QQ3 };
QQPRClass::iterator
QQPRClass::allocation_order_begin(const MachineFunction &MF) const {
return ARM_QQPR;
}
QQPRClass::iterator
QQPRClass::allocation_order_end(const MachineFunction &MF) const {
return ARM_QQPR + (sizeof(ARM_QQPR)/sizeof(unsigned));
}
}];
}
// Subset of QQPR that have 32-bit SPR subregs.
@ -483,6 +529,26 @@ def QQQQPR : RegisterClass<"ARM", [v8i64],
let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
dsub_4, dsub_5, dsub_6, dsub_7),
(QPR qsub_0, qsub_1, qsub_2, qsub_3)];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// QQQQ1 is callee saved and should be allocated last.
// Save QQQQ0 for use as QPR_VFP2 and QPR_8 classes.
static const unsigned ARM_QQQQPR[] = {
ARM::QQQQ2, ARM::QQQQ3, ARM::QQQQ0, ARM::QQQQ1 };
QQQQPRClass::iterator
QQQQPRClass::allocation_order_begin(const MachineFunction &MF) const {
return ARM_QQQQPR;
}
QQQQPRClass::iterator
QQQQPRClass::allocation_order_end(const MachineFunction &MF) const {
return ARM_QQQQPR + (sizeof(ARM_QQQQPR)/sizeof(unsigned));
}
}];
}
// Condition code registers.

View File

@ -10,9 +10,9 @@ target triple = "thumbv7-apple-darwin10"
; %reg1028 gets allocated %Q0, and if %reg1030 is reloaded for the partial
; redef, it cannot also get %Q0.
; CHECK: vld1.64 {d0, d1}, [r{{.}}]
; CHECK-NOT: vld1.64 {d0, d1}
; CHECK: vmov.f64 d3, d0
; CHECK: vld1.64 {d16, d17}, [r{{.}}]
; CHECK-NOT: vld1.64 {d16, d17}
; CHECK: vmov.f64 d19, d16
define i32 @test(i8* %arg) nounwind {
entry:

View File

@ -11,7 +11,7 @@ entry:
define double @t2(double %x) nounwind readnone optsize {
entry:
; CHECK: t2:
; CHECK: vmov.f64 d1, #3.000000e+00
; CHECK: vmov.f64 d{{.*}}, #3.000000e+00
%0 = fadd double %x, 3.000000e+00
ret double %0
}
@ -19,7 +19,7 @@ entry:
define double @t3(double %x) nounwind readnone optsize {
entry:
; CHECK: t3:
; CHECK: vmov.f64 d1, #-1.300000e+01
; CHECK: vmov.f64 d{{.*}}, #-1.300000e+01
%0 = fmul double %x, -1.300000e+01
ret double %0
}

View File

@ -7,7 +7,7 @@ define void @t() nounwind {
entry:
; CHECK: vmov.I64 q15, #0
; CHECK: vmov.32 d30[0], r0
; CHECK: vmov q0, q15
; CHECK: vmov q8, q15
%tmp = alloca %struct.int32x4_t, align 16
call void asm sideeffect "vmov.I64 q15, #0\0Avmov.32 d30[0], $1\0Avmov ${0:q}, q15\0A", "=*w,r,~{d31},~{d30}"(%struct.int32x4_t* %tmp, i32 8192) nounwind
ret void
@ -18,7 +18,7 @@ entry:
define void @t2() nounwind {
entry:
; CHECK: vmov d30, d0
; CHECK: vmov d30, d16
; CHECK: vmov.32 r0, d30[0]
%asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
ret void

View File

@ -122,9 +122,9 @@ return1:
return2:
; CHECK: %return2
; CHECK: vadd.i32
; CHECK: vmov q1, q3
; CHECK: vmov q9, q11
; CHECK-NOT: vmov
; CHECK: vst2.32 {d0, d1, d2, d3}
; CHECK: vst2.32 {d16, d17, d18, d19}
%tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
%tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
%tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
@ -136,9 +136,9 @@ return2:
define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
; CHECK: t5:
; CHECK: vldmia
; CHECK: vmov q1, q0
; CHECK: vmov q9, q8
; CHECK-NOT: vmov
; CHECK: vld2.16 {d0[1], d2[1]}, [r0]
; CHECK: vld2.16 {d16[1], d18[1]}, [r0]
; CHECK-NOT: vmov
; CHECK: vadd.i16
%tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
@ -153,8 +153,8 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
; CHECK: t6:
; CHECK: vldr.64
; CHECK: vmov d1, d0
; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
; CHECK: vmov d17, d16
; CHECK-NEXT: vld2.8 {d16[1], d17[1]}
%tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
@ -168,10 +168,10 @@ entry:
; CHECK: t7:
; CHECK: vld2.32
; CHECK: vst2.32
; CHECK: vld1.32 {d0, d1},
; CHECK: vmov q1, q0
; CHECK: vld1.32 {d16, d17},
; CHECK: vmov q9, q8
; CHECK-NOT: vmov
; CHECK: vuzp.32 q0, q1
; CHECK: vuzp.32 q8, q9
; CHECK: vst1.32
%0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
%1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
@ -188,7 +188,7 @@ entry:
; PR7156
define arm_aapcs_vfpcc i32 @t8() nounwind {
; CHECK: t8:
; CHECK: vrsqrte.f32 q0, q0
; CHECK: vrsqrte.f32 q8, q8
bb.nph55.bb.nph55.split_crit_edge:
br label %bb3
@ -238,10 +238,10 @@ bb14: ; preds = %bb6
define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
; CHECK: t9:
; CHECK: vldr.64
; CHECK-NOT: vmov d{{.*}}, d0
; CHECK: vmov.i32 d1
; CHECK-NEXT: vstmia r0, {d0, d1}
; CHECK-NEXT: vstmia r0, {d0, d1}
; CHECK-NOT: vmov d{{.*}}, d16
; CHECK: vmov.i32 d17
; CHECK-NEXT: vstmia r0, {d16, d17}
; CHECK-NEXT: vstmia r0, {d16, d17}
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
store <4 x float> %4, <4 x float>* undef, align 16
@ -269,9 +269,9 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
define arm_aapcs_vfpcc i32 @t10() nounwind {
entry:
; CHECK: t10:
; CHECK: vmov.i32 q1, #0x3F000000
; CHECK: vmov d0, d1
; CHECK: vmla.f32 q0, q0, d0[0]
; CHECK: vmov.i32 q9, #0x3F000000
; CHECK: vmov d0, d17
; CHECK: vmla.f32 q8, q8, d0[0]
%0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
%2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]

View File

@ -20,6 +20,26 @@ entry:
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
%ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
br label %bb4
@ -44,7 +64,16 @@ bb4: ; preds = %bb193, %entry
%18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
%19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
%tmp1 = fadd <4 x float> %20, %ld3
%tmp2 = fadd <4 x float> %tmp1, %ld4
%tmp3 = fadd <4 x float> %tmp2, %ld5
%tmp4 = fadd <4 x float> %tmp3, %ld6
%tmp5 = fadd <4 x float> %tmp4, %ld7
%tmp6 = fadd <4 x float> %tmp5, %ld8
%tmp7 = fadd <4 x float> %tmp6, %ld9
%tmp8 = fadd <4 x float> %tmp7, %ld10
%tmp9 = fadd <4 x float> %tmp8, %ld11
%21 = fadd <4 x float> %tmp9, %ld12
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
%tmp = extractelement <4 x i1> %22, i32 0
br i1 %tmp, label %bb193, label %bb186

View File

@ -161,9 +161,9 @@ define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
; rdar://7923010
define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vcgt_zext:
;CHECK: vcgt.f32 q0
;CHECK: vmov.i32 q1, #0x1
;CHECK: vand q0, q0, q1
;CHECK: vcgt.f32 q8
;CHECK: vmov.i32 q9, #0x1
;CHECK: vand q8, q8, q9
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2

View File

@ -96,7 +96,7 @@ define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d0[1]
; CHECK: vmov.u16 r0, d{{.*}}[1]
%arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@ -111,7 +111,7 @@ return: ; preds = %entry
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d0[1]
; CHECK: vmov.u8 r0, d{{.*}}[1]
%arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@ -126,7 +126,7 @@ return: ; preds = %entry
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d0[1]
; CHECK: vmov.u16 r0, d{{.*}}[1]
%arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@ -141,7 +141,7 @@ return: ; preds = %entry
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d0[1]
; CHECK: vmov.u8 r0, d{{.*}}[1]
%arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]

View File

@ -3,7 +3,7 @@
define <8 x i8> @vld1i8(i8* %A) nounwind {
;CHECK: vld1i8:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld1.8 {d0}, [r0, :64]
;CHECK: vld1.8 {d16}, [r0, :64]
%tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16)
ret <8 x i8> %tmp1
}
@ -43,7 +43,7 @@ define <1 x i64> @vld1i64(i64* %A) nounwind {
define <16 x i8> @vld1Qi8(i8* %A) nounwind {
;CHECK: vld1Qi8:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld1.8 {d0, d1}, [r0, :64]
;CHECK: vld1.8 {d16, d17}, [r0, :64]
%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
ret <16 x i8> %tmp1
}
@ -51,7 +51,7 @@ define <16 x i8> @vld1Qi8(i8* %A) nounwind {
define <8 x i16> @vld1Qi16(i16* %A) nounwind {
;CHECK: vld1Qi16:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld1.16 {d0, d1}, [r0, :128]
;CHECK: vld1.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32)
ret <8 x i16> %tmp1

View File

@ -14,7 +14,7 @@
define <8 x i8> @vld2i8(i8* %A) nounwind {
;CHECK: vld2i8:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld2.8 {d0, d1}, [r0, :64]
;CHECK: vld2.8 {d16, d17}, [r0, :64]
%tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
@ -25,7 +25,7 @@ define <8 x i8> @vld2i8(i8* %A) nounwind {
define <4 x i16> @vld2i16(i16* %A) nounwind {
;CHECK: vld2i16:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld2.16 {d0, d1}, [r0, :128]
;CHECK: vld2.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
@ -59,7 +59,7 @@ define <2 x float> @vld2f(float* %A) nounwind {
define <1 x i64> @vld2i64(i64* %A) nounwind {
;CHECK: vld2i64:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld1.64 {d0, d1}, [r0, :128]
;CHECK: vld1.64 {d16, d17}, [r0, :128]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
@ -71,7 +71,7 @@ define <1 x i64> @vld2i64(i64* %A) nounwind {
define <16 x i8> @vld2Qi8(i8* %A) nounwind {
;CHECK: vld2Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld2.8 {d0, d1, d2, d3}, [r0, :64]
;CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
@ -82,7 +82,7 @@ define <16 x i8> @vld2Qi8(i8* %A) nounwind {
define <8 x i16> @vld2Qi16(i16* %A) nounwind {
;CHECK: vld2Qi16:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld2.16 {d0, d1, d2, d3}, [r0, :128]
;CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
@ -94,7 +94,7 @@ define <8 x i16> @vld2Qi16(i16* %A) nounwind {
define <4 x i32> @vld2Qi32(i32* %A) nounwind {
;CHECK: vld2Qi32:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld2.32 {d0, d1, d2, d3}, [r0, :256]
;CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64)
%tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0

View File

@ -14,7 +14,7 @@
define <8 x i8> @vld3i8(i8* %A) nounwind {
;CHECK: vld3i8:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld3.8 {d0, d1, d2}, [r0, :64]
;CHECK: vld3.8 {d16, d17, d18}, [r0, :64]
%tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32)
%tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
@ -58,7 +58,7 @@ define <2 x float> @vld3f(float* %A) nounwind {
define <1 x i64> @vld3i64(i64* %A) nounwind {
;CHECK: vld3i64:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld1.64 {d0, d1, d2}, [r0, :64]
;CHECK: vld1.64 {d16, d17, d18}, [r0, :64]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
@ -70,8 +70,8 @@ define <1 x i64> @vld3i64(i64* %A) nounwind {
define <16 x i8> @vld3Qi8(i8* %A) nounwind {
;CHECK: vld3Qi8:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld3.8 {d0, d2, d4}, [r0, :64]!
;CHECK: vld3.8 {d1, d3, d5}, [r0, :64]
;CHECK: vld3.8 {d16, d18, d20}, [r0, :64]!
;CHECK: vld3.8 {d17, d19, d21}, [r0, :64]
%tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32)
%tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2

View File

@ -14,7 +14,7 @@
define <8 x i8> @vld4i8(i8* %A) nounwind {
;CHECK: vld4i8:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.8 {d0, d1, d2, d3}, [r0, :64]
;CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
@ -25,7 +25,7 @@ define <8 x i8> @vld4i8(i8* %A) nounwind {
define <4 x i16> @vld4i16(i16* %A) nounwind {
;CHECK: vld4i16:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.16 {d0, d1, d2, d3}, [r0, :128]
;CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
@ -37,7 +37,7 @@ define <4 x i16> @vld4i16(i16* %A) nounwind {
define <2 x i32> @vld4i32(i32* %A) nounwind {
;CHECK: vld4i32:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.32 {d0, d1, d2, d3}, [r0, :256]
;CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
@ -60,7 +60,7 @@ define <2 x float> @vld4f(float* %A) nounwind {
define <1 x i64> @vld4i64(i64* %A) nounwind {
;CHECK: vld4i64:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld1.64 {d0, d1, d2, d3}, [r0, :256]
;CHECK: vld1.64 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
%tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
@ -72,8 +72,8 @@ define <1 x i64> @vld4i64(i64* %A) nounwind {
define <16 x i8> @vld4Qi8(i8* %A) nounwind {
;CHECK: vld4Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.8 {d0, d2, d4, d6}, [r0, :256]!
;CHECK: vld4.8 {d1, d3, d5, d7}, [r0, :256]
;CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]!
;CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256]
%tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64)
%tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
@ -84,8 +84,8 @@ define <16 x i8> @vld4Qi8(i8* %A) nounwind {
define <8 x i16> @vld4Qi16(i16* %A) nounwind {
;CHECK: vld4Qi16:
;Check for no alignment specifier.
;CHECK: vld4.16 {d0, d2, d4, d6}, [r0]!
;CHECK: vld4.16 {d1, d3, d5, d7}, [r0]
;CHECK: vld4.16 {d16, d18, d20, d22}, [r0]!
;CHECK: vld4.16 {d17, d19, d21, d23}, [r0]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0

View File

@ -2,169 +2,169 @@
define <8 x i8> @v_movi8() nounwind {
;CHECK: v_movi8:
;CHECK: vmov.i8 d0, #0x8
;CHECK: vmov.i8 d{{.*}}, #0x8
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <4 x i16> @v_movi16a() nounwind {
;CHECK: v_movi16a:
;CHECK: vmov.i16 d0, #0x10
;CHECK: vmov.i16 d{{.*}}, #0x10
ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
}
define <4 x i16> @v_movi16b() nounwind {
;CHECK: v_movi16b:
;CHECK: vmov.i16 d0, #0x1000
;CHECK: vmov.i16 d{{.*}}, #0x1000
ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i16> @v_mvni16a() nounwind {
;CHECK: v_mvni16a:
;CHECK: vmvn.i16 d0, #0x10
;CHECK: vmvn.i16 d{{.*}}, #0x10
ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <4 x i16> @v_mvni16b() nounwind {
;CHECK: v_mvni16b:
;CHECK: vmvn.i16 d0, #0x1000
;CHECK: vmvn.i16 d{{.*}}, #0x1000
ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <2 x i32> @v_movi32a() nounwind {
;CHECK: v_movi32a:
;CHECK: vmov.i32 d0, #0x20
;CHECK: vmov.i32 d{{.*}}, #0x20
ret <2 x i32> < i32 32, i32 32 >
}
define <2 x i32> @v_movi32b() nounwind {
;CHECK: v_movi32b:
;CHECK: vmov.i32 d0, #0x2000
;CHECK: vmov.i32 d{{.*}}, #0x2000
ret <2 x i32> < i32 8192, i32 8192 >
}
define <2 x i32> @v_movi32c() nounwind {
;CHECK: v_movi32c:
;CHECK: vmov.i32 d0, #0x200000
;CHECK: vmov.i32 d{{.*}}, #0x200000
ret <2 x i32> < i32 2097152, i32 2097152 >
}
define <2 x i32> @v_movi32d() nounwind {
;CHECK: v_movi32d:
;CHECK: vmov.i32 d0, #0x20000000
;CHECK: vmov.i32 d{{.*}}, #0x20000000
ret <2 x i32> < i32 536870912, i32 536870912 >
}
define <2 x i32> @v_movi32e() nounwind {
;CHECK: v_movi32e:
;CHECK: vmov.i32 d0, #0x20FF
;CHECK: vmov.i32 d{{.*}}, #0x20FF
ret <2 x i32> < i32 8447, i32 8447 >
}
define <2 x i32> @v_movi32f() nounwind {
;CHECK: v_movi32f:
;CHECK: vmov.i32 d0, #0x20FFFF
;CHECK: vmov.i32 d{{.*}}, #0x20FFFF
ret <2 x i32> < i32 2162687, i32 2162687 >
}
define <2 x i32> @v_mvni32a() nounwind {
;CHECK: v_mvni32a:
;CHECK: vmvn.i32 d0, #0x20
;CHECK: vmvn.i32 d{{.*}}, #0x20
ret <2 x i32> < i32 4294967263, i32 4294967263 >
}
define <2 x i32> @v_mvni32b() nounwind {
;CHECK: v_mvni32b:
;CHECK: vmvn.i32 d0, #0x2000
;CHECK: vmvn.i32 d{{.*}}, #0x2000
ret <2 x i32> < i32 4294959103, i32 4294959103 >
}
define <2 x i32> @v_mvni32c() nounwind {
;CHECK: v_mvni32c:
;CHECK: vmvn.i32 d0, #0x200000
;CHECK: vmvn.i32 d{{.*}}, #0x200000
ret <2 x i32> < i32 4292870143, i32 4292870143 >
}
define <2 x i32> @v_mvni32d() nounwind {
;CHECK: v_mvni32d:
;CHECK: vmvn.i32 d0, #0x20000000
;CHECK: vmvn.i32 d{{.*}}, #0x20000000
ret <2 x i32> < i32 3758096383, i32 3758096383 >
}
define <2 x i32> @v_mvni32e() nounwind {
;CHECK: v_mvni32e:
;CHECK: vmvn.i32 d0, #0x20FF
;CHECK: vmvn.i32 d{{.*}}, #0x20FF
ret <2 x i32> < i32 4294958848, i32 4294958848 >
}
define <2 x i32> @v_mvni32f() nounwind {
;CHECK: v_mvni32f:
;CHECK: vmvn.i32 d0, #0x20FFFF
;CHECK: vmvn.i32 d{{.*}}, #0x20FFFF
ret <2 x i32> < i32 4292804608, i32 4292804608 >
}
define <1 x i64> @v_movi64() nounwind {
;CHECK: v_movi64:
;CHECK: vmov.i64 d0, #0xFF0000FF0000FFFF
;CHECK: vmov.i64 d{{.*}}, #0xFF0000FF0000FFFF
ret <1 x i64> < i64 18374687574888349695 >
}
define <16 x i8> @v_movQi8() nounwind {
;CHECK: v_movQi8:
;CHECK: vmov.i8 q0, #0x8
;CHECK: vmov.i8 q{{.*}}, #0x8
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <8 x i16> @v_movQi16a() nounwind {
;CHECK: v_movQi16a:
;CHECK: vmov.i16 q0, #0x10
;CHECK: vmov.i16 q{{.*}}, #0x10
ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
}
define <8 x i16> @v_movQi16b() nounwind {
;CHECK: v_movQi16b:
;CHECK: vmov.i16 q0, #0x1000
;CHECK: vmov.i16 q{{.*}}, #0x1000
ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i32> @v_movQi32a() nounwind {
;CHECK: v_movQi32a:
;CHECK: vmov.i32 q0, #0x20
;CHECK: vmov.i32 q{{.*}}, #0x20
ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
}
define <4 x i32> @v_movQi32b() nounwind {
;CHECK: v_movQi32b:
;CHECK: vmov.i32 q0, #0x2000
;CHECK: vmov.i32 q{{.*}}, #0x2000
ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
}
define <4 x i32> @v_movQi32c() nounwind {
;CHECK: v_movQi32c:
;CHECK: vmov.i32 q0, #0x200000
;CHECK: vmov.i32 q{{.*}}, #0x200000
ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
}
define <4 x i32> @v_movQi32d() nounwind {
;CHECK: v_movQi32d:
;CHECK: vmov.i32 q0, #0x20000000
;CHECK: vmov.i32 q{{.*}}, #0x20000000
ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
}
define <4 x i32> @v_movQi32e() nounwind {
;CHECK: v_movQi32e:
;CHECK: vmov.i32 q0, #0x20FF
;CHECK: vmov.i32 q{{.*}}, #0x20FF
ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
}
define <4 x i32> @v_movQi32f() nounwind {
;CHECK: v_movQi32f:
;CHECK: vmov.i32 q0, #0x20FFFF
;CHECK: vmov.i32 q{{.*}}, #0x20FFFF
ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
}
define <2 x i64> @v_movQi64() nounwind {
;CHECK: v_movQi64:
;CHECK: vmov.i64 q0, #0xFF0000FF0000FFFF
;CHECK: vmov.i64 q{{.*}}, #0xFF0000FF0000FFFF
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
@ -173,7 +173,7 @@ define <2 x i64> @v_movQi64() nounwind {
define void @vdupn128(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
entry:
;CHECK: vdupn128:
;CHECK: vmov.i8 d0, #0x80
;CHECK: vmov.i8 d{{.*}}, #0x80
%0 = getelementptr inbounds %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
store <8 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>, <8 x i8>* %0, align 8
ret void
@ -182,7 +182,7 @@ entry:
define void @vdupnneg75(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
entry:
;CHECK: vdupnneg75:
;CHECK: vmov.i8 d0, #0xB5
;CHECK: vmov.i8 d{{.*}}, #0xB5
%0 = getelementptr inbounds %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
store <8 x i8> <i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75>, <8 x i8>* %0, align 8
ret void

View File

@ -3,7 +3,7 @@
define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst1i8:
;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vst1.8 {d0}, [r0, :64]
;CHECK: vst1.8 {d16}, [r0, :64]
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16)
ret void
@ -48,7 +48,7 @@ define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst1Qi8:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst1.8 {d0, d1}, [r0, :64]
;CHECK: vst1.8 {d16, d17}, [r0, :64]
%tmp1 = load <16 x i8>* %B
call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
ret void
@ -57,7 +57,7 @@ define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst1Qi16:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst1.16 {d0, d1}, [r0, :128]
;CHECK: vst1.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)

View File

@ -3,7 +3,7 @@
define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst2i8:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst2.8 {d0, d1}, [r0, :64]
;CHECK: vst2.8 {d16, d17}, [r0, :64]
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
ret void
@ -12,7 +12,7 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2i16:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst2.16 {d0, d1}, [r0, :128]
;CHECK: vst2.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32)
@ -40,7 +40,7 @@ define void @vst2f(float* %A, <2 x float>* %B) nounwind {
define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst2i64:
;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst1.64 {d0, d1}, [r0, :128]
;CHECK: vst1.64 {d16, d17}, [r0, :128]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>* %B
call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32)
@ -50,7 +50,7 @@ define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst2Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst2.8 {d0, d1, d2, d3}, [r0, :64]
;CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = load <16 x i8>* %B
call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
ret void
@ -59,7 +59,7 @@ define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst2Qi16:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst2.16 {d0, d1, d2, d3}, [r0, :128]
;CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
@ -69,7 +69,7 @@ define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst2Qi32:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst2.32 {d0, d1, d2, d3}, [r0, :256]
;CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>* %B
call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)

View File

@ -3,7 +3,7 @@
define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst4i8:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst4.8 {d0, d1, d2, d3}, [r0, :64]
;CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
ret void
@ -12,7 +12,7 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4i16:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst4.16 {d0, d1, d2, d3}, [r0, :128]
;CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16)
@ -22,7 +22,7 @@ define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4i32:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst4.32 {d0, d1, d2, d3}, [r0, :256]
;CHECK: vst4.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>* %B
call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32)
@ -41,7 +41,7 @@ define void @vst4f(float* %A, <2 x float>* %B) nounwind {
define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst4i64:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst1.64 {d0, d1, d2, d3}, [r0, :256]
;CHECK: vst1.64 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>* %B
call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64)
@ -51,8 +51,8 @@ define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst4Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst4.8 {d0, d2, d4, d6}, [r0, :256]!
;CHECK: vst4.8 {d1, d3, d5, d7}, [r0, :256]
;CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]!
;CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256]
%tmp1 = load <16 x i8>* %B
call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64)
ret void
@ -61,8 +61,8 @@ define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst4Qi16:
;Check for no alignment specifier.
;CHECK: vst4.16 {d0, d2, d4, d6}, [r0]!
;CHECK: vst4.16 {d1, d3, d5, d7}, [r0]
;CHECK: vst4.16 {d16, d18, d20, d22}, [r0]!
;CHECK: vst4.16 {d17, d19, d21, d23}, [r0]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)

View File

@ -23,8 +23,8 @@ entry:
%4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
; Constant pool load followed by add.
; Then clobber the loaded register, not the sum.
; CHECK: vldr.64 [[LDR:d.]]
; CHECK: vadd.f64 [[ADD:d.]], [[LDR]], [[LDR]]
; CHECK: vldr.64 [[LDR:d.*]],
; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]
; CHECK: vmov.f64 [[LDR]]
%5 = fadd <2 x double> %3, %3 ; <<2 x double>> [#uses=2]
%6 = fadd <2 x double> %4, %4 ; <<2 x double>> [#uses=2]

View File

@ -19,6 +19,6 @@ entry:
%0 = fmul double %a, %b
; CORTEXM3: blx ___muldf3
; CORTEXM4: blx ___muldf3
; CORTEXA8: vmul.f64 d0, d1, d0
; CORTEXA8: vmul.f64 d16, d17, d16
ret double %0
}

View File

@ -56,7 +56,7 @@ define void @t2(i8* %ptr1, i8* %ptr2) nounwind {
entry:
; CHECK: t2:
; CHECK: adr r{{.}}, #LCPI1_0
; CHECK: vldmia r3, {d0, d1}
; CHECK: vldmia r3, {d16, d17}
br i1 undef, label %bb1, label %bb2
bb1:

View File

@ -20,6 +20,26 @@ entry:
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
%ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4
%val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
br label %bb4
@ -44,7 +64,16 @@ bb4: ; preds = %bb193, %entry
%18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
%19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
%tmp1 = fadd <4 x float> %20, %ld3
%tmp2 = fadd <4 x float> %tmp1, %ld4
%tmp3 = fadd <4 x float> %tmp2, %ld5
%tmp4 = fadd <4 x float> %tmp3, %ld6
%tmp5 = fadd <4 x float> %tmp4, %ld7
%tmp6 = fadd <4 x float> %tmp5, %ld8
%tmp7 = fadd <4 x float> %tmp6, %ld9
%tmp8 = fadd <4 x float> %tmp7, %ld10
%tmp9 = fadd <4 x float> %tmp8, %ld11
%21 = fadd <4 x float> %tmp9, %ld12
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
%tmp = extractelement <4 x i1> %22, i32 0
br i1 %tmp, label %bb193, label %bb186