diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index f1f3d4c6e89..cae4b9babec 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -14641,6 +14641,9 @@ static SDValue CombineBaseUpdate(SDNode *N, case Intrinsic::arm_neon_vld1x2: case Intrinsic::arm_neon_vld1x3: case Intrinsic::arm_neon_vld1x4: + case Intrinsic::arm_neon_vst1x2: + case Intrinsic::arm_neon_vst1x3: + case Intrinsic::arm_neon_vst1x4: case Intrinsic::arm_neon_vld2dup: case Intrinsic::arm_neon_vld3dup: case Intrinsic::arm_neon_vld4dup: diff --git a/test/CodeGen/ARM/arm-vst1.ll b/test/CodeGen/ARM/arm-vst1.ll index 3e8f6d76c31..6c9c07cb7c4 100644 --- a/test/CodeGen/ARM/arm-vst1.ll +++ b/test/CodeGen/ARM/arm-vst1.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \ -; RUN: -asm-verbose=false | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs -asm-verbose=false | FileCheck %s ; %struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> } ; %struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } @@ -90,9 +90,10 @@ declare void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8 declare void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>, <16 x i8>) argmemonly nounwind declare void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* nocapture, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) argmemonly nounwind -; CHECK-LABEL: test_vst1_u16_x2 -; CHECK: vst1.16 {d16, d17}, [r0:64] -define void @test_vst1_u16_x2(i16* %a, %struct.uint16x4x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u16_x2(i16* %a, %struct.uint16x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x2: +; CHECK: vst1.16 {d0, d1}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x2_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x2_t %b, 0, 1 @@ -100,9 +101,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u16_x3 -; CHECK: vst1.16 {d16, d17, d18}, [r0:64] -define void @test_vst1_u16_x3(i16* %a, %struct.uint16x4x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u16_x3(i16* %a, %struct.uint16x4x3_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x3: +; CHECK: vst1.16 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x3_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x3_t %b, 0, 1 @@ -111,9 +113,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u16_x4 -; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1_u16_x4(i16* %a, %struct.uint16x4x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u16_x4(i16* %a, %struct.uint16x4x4_t %b) nounwind { +; CHECK-LABEL: test_vst1_u16_x4: +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x4x4_t %b, 0, 0 %b1 = extractvalue %struct.uint16x4x4_t %b, 0, 1 @@ -123,9 +126,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u32_x2 -; CHECK: vst1.32 {d16, d17}, [r0:64] -define void @test_vst1_u32_x2(i32* %a, %struct.uint32x2x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u32_x2(i32* %a, %struct.uint32x2x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u32_x2: +; CHECK: vst1.32 {d0, d1}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x2_t %b, 0, 0 %b1 = extractvalue %struct.uint32x2x2_t %b, 0, 1 @@ -133,9 +137,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u32_x3 -; CHECK: vst1.32 {d16, d17, d18}, [r0:64] -define void @test_vst1_u32_x3(i32* %a, %struct.uint32x2x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u32_x3(i32* %a, %struct.uint32x2x3_t %b) nounwind { +; CHECK-LABEL: test_vst1_u32_x3: +; CHECK: vst1.32 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x3_t %b, 0, 0 %b1 = extractvalue %struct.uint32x2x3_t %b, 0, 1 @@ -144,9 +149,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u32_x4 -; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1_u32_x4(i32* %a, %struct.uint32x2x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u32_x4(i32* %a, %struct.uint32x2x4_t %b) nounwind { +; CHECK-LABEL: test_vst1_u32_x4: +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x2x4_t %b, 0, 0 %b1 = extractvalue %struct.uint32x2x4_t %b, 0, 1 @@ -156,9 +162,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u64_x2 -; CHECK: vst1.64 {d16, d17}, [r0:64] -define void @test_vst1_u64_x2(i64* %a, %struct.uint64x1x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u64_x2(i64* %a, %struct.uint64x1x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u64_x2: +; CHECK: vst1.64 {d0, d1}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x2_t %b, 0, 0 %b1 = extractvalue %struct.uint64x1x2_t %b, 0, 1 @@ -166,9 +173,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u64_x3 -; CHECK: vst1.64 {d16, d17, d18}, [r0:64] -define void @test_vst1_u64_x3(i64* %a, %struct.uint64x1x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u64_x3(i64* %a, %struct.uint64x1x3_t %b) nounwind { +; CHECK-LABEL: test_vst1_u64_x3: +; CHECK: vst1.64 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x3_t %b, 0, 0 %b1 = extractvalue %struct.uint64x1x3_t %b, 0, 1 @@ -177,9 +185,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u64_x4 -; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1_u64_x4(i64* %a, %struct.uint64x1x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u64_x4(i64* %a, %struct.uint64x1x4_t %b) nounwind { +; CHECK-LABEL: test_vst1_u64_x4: +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x1x4_t %b, 0, 0 %b1 = extractvalue %struct.uint64x1x4_t %b, 0, 1 @@ -189,9 +198,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u8_x2 -; CHECK: vst1.8 {d16, d17}, [r0:64] -define void @test_vst1_u8_x2(i8* %a, %struct.uint8x8x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u8_x2(i8* %a, %struct.uint8x8x2_t %b) nounwind { +; CHECK-LABEL: test_vst1_u8_x2: +; CHECK: vst1.8 {d0, d1}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x2_t %b, 0, 0 %b1 = extractvalue %struct.uint8x8x2_t %b, 0, 1 @@ -199,9 +209,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u8_x3 -; CHECK: vst1.8 {d16, d17, d18}, [r0:64] -define void @test_vst1_u8_x3(i8* %a, %struct.uint8x8x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u8_x3(i8* %a, %struct.uint8x8x3_t %b) nounwind { +; CHECK-LABEL: test_vst1_u8_x3: +; CHECK: vst1.8 {d0, d1, d2}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x3_t %b, 0, 0 %b1 = extractvalue %struct.uint8x8x3_t %b, 0, 1 @@ -210,9 +221,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1_u8_x4 -; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1_u8_x4(i8* %a, %struct.uint8x8x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1_u8_x4(i8* %a, %struct.uint8x8x4_t %b) nounwind { +; CHECK-LABEL: test_vst1_u8_x4: +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x8x4_t %b, 0, 0 %b1 = extractvalue %struct.uint8x8x4_t %b, 0, 1 @@ -222,9 +234,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u16_x2 -; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1q_u16_x2(i16* %a, %struct.uint16x8x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u16_x2(i16* %a, %struct.uint16x8x2_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u16_x2: +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x2_t %b, 0, 0 %b1 = extractvalue %struct.uint16x8x2_t %b, 0, 1 @@ -232,10 +245,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u16_x3 -; CHECK: vst1.16 {d16, d17, d18}, [r0:64]! -; CHECK: vst1.16 {d19, d20, d21}, [r0:64] -define void @test_vst1q_u16_x3(i16* %a, %struct.uint16x8x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u16_x3(i16* %a, %struct.uint16x8x3_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u16_x3: +; CHECK: vst1.16 {d0, d1, d2}, [r0:64]! +; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x3_t %b, 0, 0 %b1 = extractvalue %struct.uint16x8x3_t %b, 0, 1 @@ -244,10 +258,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u16_x4 -; CHECK: vst1.16 {d16, d17, d18, d19}, [r0:256]! -; CHECK: vst1.16 {d20, d21, d22, d23}, [r0:256] -define void @test_vst1q_u16_x4(i16* %a, %struct.uint16x8x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u16_x4(i16* %a, %struct.uint16x8x4_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u16_x4: +; CHECK: vst1.16 {d0, d1, d2, d3}, [r0:256]! +; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint16x8x4_t %b, 0, 0 %b1 = extractvalue %struct.uint16x8x4_t %b, 0, 1 @@ -257,9 +272,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u32_x2 -; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1q_u32_x2(i32* %a, %struct.uint32x4x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u32_x2(i32* %a, %struct.uint32x4x2_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u32_x2: +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x2_t %b, 0, 0 %b1 = extractvalue %struct.uint32x4x2_t %b, 0, 1 @@ -267,10 +283,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u32_x3 -; CHECK: vst1.32 {d16, d17, d18}, [r0:64]! -; CHECK: vst1.32 {d19, d20, d21}, [r0:64] -define void @test_vst1q_u32_x3(i32* %a, %struct.uint32x4x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u32_x3(i32* %a, %struct.uint32x4x3_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u32_x3: +; CHECK: vst1.32 {d0, d1, d2}, [r0:64]! +; CHECK-NEXT: vst1.32 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x3_t %b, 0, 0 %b1 = extractvalue %struct.uint32x4x3_t %b, 0, 1 @@ -279,10 +296,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u32_x4 -; CHECK: vst1.32 {d16, d17, d18, d19}, [r0:256]! -; CHECK: vst1.32 {d20, d21, d22, d23}, [r0:256] -define void @test_vst1q_u32_x4(i32* %a, %struct.uint32x4x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u32_x4(i32* %a, %struct.uint32x4x4_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u32_x4: +; CHECK: vst1.32 {d0, d1, d2, d3}, [r0:256]! +; CHECK-NEXT: vst1.32 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint32x4x4_t %b, 0, 0 %b1 = extractvalue %struct.uint32x4x4_t %b, 0, 1 @@ -292,9 +310,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u64_x2 -; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1q_u64_x2(i64* %a, %struct.uint64x2x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u64_x2(i64* %a, %struct.uint64x2x2_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u64_x2: +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x2_t %b, 0, 0 %b1 = extractvalue %struct.uint64x2x2_t %b, 0, 1 @@ -302,10 +321,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u64_x3 -; CHECK: vst1.64 {d16, d17, d18}, [r0:64]! -; CHECK: vst1.64 {d19, d20, d21}, [r0:64] -define void @test_vst1q_u64_x3(i64* %a, %struct.uint64x2x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u64_x3(i64* %a, %struct.uint64x2x3_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u64_x3: +; CHECK: vst1.64 {d0, d1, d2}, [r0:64]! +; CHECK-NEXT: vst1.64 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x3_t %b, 0, 0 %b1 = extractvalue %struct.uint64x2x3_t %b, 0, 1 @@ -314,10 +334,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u64_x4 -; CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256]! -; CHECK: vst1.64 {d20, d21, d22, d23}, [r0:256] -define void @test_vst1q_u64_x4(i64* %a, %struct.uint64x2x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u64_x4(i64* %a, %struct.uint64x2x4_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u64_x4: +; CHECK: vst1.64 {d0, d1, d2, d3}, [r0:256]! +; CHECK-NEXT: vst1.64 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint64x2x4_t %b, 0, 0 %b1 = extractvalue %struct.uint64x2x4_t %b, 0, 1 @@ -327,9 +348,10 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u8_x2 -; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256] -define void @test_vst1q_u8_x2(i8* %a, %struct.uint8x16x2_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u8_x2(i8* %a, %struct.uint8x16x2_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u8_x2: +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x2_t %b, 0, 0 %b1 = extractvalue %struct.uint8x16x2_t %b, 0, 1 @@ -337,10 +359,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u8_x3 -; CHECK: vst1.8 {d16, d17, d18}, [r0:64]! -; CHECK: vst1.8 {d19, d20, d21}, [r0:64] -define void @test_vst1q_u8_x3(i8* %a, %struct.uint8x16x3_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u8_x3(i8* %a, %struct.uint8x16x3_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u8_x3: +; CHECK: vst1.8 {d0, d1, d2}, [r0:64]! +; CHECK-NEXT: vst1.8 {d3, d4, d5}, [r0:64] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x3_t %b, 0, 0 %b1 = extractvalue %struct.uint8x16x3_t %b, 0, 1 @@ -349,10 +372,11 @@ entry: ret void } -; CHECK-LABEL: test_vst1q_u8_x4 -; CHECK: vst1.8 {d16, d17, d18, d19}, [r0:256]! -; CHECK: vst1.8 {d20, d21, d22, d23}, [r0:256] -define void @test_vst1q_u8_x4(i8* %a, %struct.uint8x16x4_t %b) nounwind { +define arm_aapcs_vfpcc void @test_vst1q_u8_x4(i8* %a, %struct.uint8x16x4_t %b) nounwind { +; CHECK-LABEL: test_vst1q_u8_x4: +; CHECK: vst1.8 {d0, d1, d2, d3}, [r0:256]! +; CHECK-NEXT: vst1.8 {d4, d5, d6, d7}, [r0:256] +; CHECK-NEXT: bx lr entry: %b0 = extractvalue %struct.uint8x16x4_t %b, 0, 0 %b1 = extractvalue %struct.uint8x16x4_t %b, 0, 1 @@ -361,3 +385,93 @@ entry: tail call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> %b0, <16 x i8> %b1, <16 x i8> %b2, <16 x i8> %b3) ret void } + +define void @postinc_1x2(i8* nocapture %0, i8* %1) { +; CHECK-LABEL: postinc_1x2: +; CHECK: vld1.8 {d16, d17, d18, d19}, [r1:256] +; CHECK-NEXT: add r1, r1, #32 +; CHECK-NEXT: vst1.8 {d16, d17, d18, d19}, [r0:256] +; CHECK-NEXT: add r0, r0, #32 +; CHECK-NEXT: vld1.8 {d16, d17, d18, d19}, [r1:256] +; CHECK-NEXT: vst1.8 {d16, d17, d18, d19}, [r0:256] +; CHECK-NEXT: bx lr + %3 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x2.v16i8.p0i8(i8* %1) + %4 = extractvalue { <16 x i8>, <16 x i8> } %3, 0 + %5 = extractvalue { <16 x i8>, <16 x i8> } %3, 1 + tail call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %0, <16 x i8> %4, <16 x i8> %5) + %6 = getelementptr inbounds i8, i8* %1, i32 32 + %7 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x2.v16i8.p0i8(i8* nonnull %6) + %8 = extractvalue { <16 x i8>, <16 x i8> } %7, 0 + %9 = extractvalue { <16 x i8>, <16 x i8> } %7, 1 + %10 = getelementptr inbounds i8, i8* %0, i32 32 + tail call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* nonnull %10, <16 x i8> %8, <16 x i8> %9) + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x2.v16i8.p0i8(i8*) + +define void @postinc_1x3(i8* nocapture %0, i8* %1) { +; CHECK-LABEL: postinc_1x3: +; CHECK: add r2, r1, #48 +; CHECK-NEXT: vld1.8 {d16, d17, d18}, [r1:64]! +; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r1:64] +; CHECK-NEXT: add r1, r0, #48 +; CHECK-NEXT: vst1.8 {d16, d17, d18}, [r0:64]! +; CHECK-NEXT: vst1.8 {d19, d20, d21}, [r0:64] +; CHECK-NEXT: vld1.8 {d16, d17, d18}, [r2:64]! +; CHECK-NEXT: vld1.8 {d19, d20, d21}, [r2:64] +; CHECK-NEXT: vst1.8 {d16, d17, d18}, [r1:64]! +; CHECK-NEXT: vst1.8 {d19, d20, d21}, [r1:64] +; CHECK-NEXT: bx lr + %3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x3.v16i8.p0i8(i8* %1) + %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %3, 0 + %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %3, 1 + %a5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %3, 2 + tail call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %0, <16 x i8> %4, <16 x i8> %5, <16 x i8> %a5) + %6 = getelementptr inbounds i8, i8* %1, i32 48 + %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x3.v16i8.p0i8(i8* nonnull %6) + %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0 + %9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 1 + %a9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 2 + %10 = getelementptr inbounds i8, i8* %0, i32 48 + tail call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* nonnull %10, <16 x i8> %8, <16 x i8> %9, <16 x i8> %a9) + ret void +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x3.v16i8.p0i8(i8*) + +define void @postinc_1x4(i8* nocapture %0, i8* %1) { +; CHECK-LABEL: postinc_1x4: +; CHECK: add r2, r1, #64 +; CHECK-NEXT: vld1.8 {d16, d17, d18, d19}, [r1:256]! +; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r1:256] +; CHECK-NEXT: add r1, r0, #64 +; CHECK-NEXT: vst1.8 {d16, d17, d18, d19}, [r0:256]! +; CHECK-NEXT: vst1.8 {d20, d21, d22, d23}, [r0:256] +; CHECK-NEXT: vld1.8 {d16, d17, d18, d19}, [r2:256]! +; CHECK-NEXT: vld1.8 {d20, d21, d22, d23}, [r2:256] +; CHECK-NEXT: vorr q15, q11, q11 +; CHECK-NEXT: vorr q14, q10, q10 +; CHECK-NEXT: vorr q13, q9, q9 +; CHECK-NEXT: vorr q12, q8, q8 +; CHECK-NEXT: vst1.8 {d24, d25, d26, d27}, [r1:256]! +; CHECK-NEXT: vst1.8 {d28, d29, d30, d31}, [r1:256] +; CHECK-NEXT: bx lr + %3 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x4.v16i8.p0i8(i8* %1) + %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %3, 0 + %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %3, 1 + %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %3, 2 + %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %3, 3 + tail call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %0, <16 x i8> %4, <16 x i8> %5, <16 x i8> %6, <16 x i8> %7) + %8 = getelementptr inbounds i8, i8* %1, i32 64 + %9 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x4.v16i8.p0i8(i8* nonnull %8) + %10 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %9, 0 + %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %9, 1 + %12 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %9, 2 + %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %9, 3 + %14 = getelementptr inbounds i8, i8* %0, i32 64 + tail call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* nonnull %14, <16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13) + ret void +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld1x4.v16i8.p0i8(i8*)