1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 11:02:59 +02:00
llvm-mirror/test/CodeGen/ARM/vst3.ll
Florian Hahn 06a6d27ef3 [ARM] Fix codegen for VLD3/VLD4/VST3/VST4 with WB
Code generation of VLD3, VLD4, VST3 and VST4 with register writeback is
broken due to 2 separate bugs:

1) VLD1d64TPseudoWB_register and VLD1d64QPseudoWB_register are missing
   rules to expand them to non pseudo MIR. These are selected for
   ARMISD::VLD3_UPD/VLD4_UPD with v1i64 vectors in SelectVLD.

2) Selection of the right VLD/VST instruction is broken for load and
   store of 3 and 4 v1i64 vectors. SelectVLD and SelectVST are called
   with MIR opcode for fixed writeback (ie increment is access size)
   and call getVLDSTRegisterUpdateOpcode() to select an opcode with
   register writeback if base register update is of a different size.
   Since getVLDSTRegisterUpdateOpcode() only knows about
   VLD1/VLD2/VST1/VST2 the call is currently conditional on the number
   of element in the vector.

   However, VLD1/VST1 is selected by SelectVLD/SelectVST's caller for
   load and stores of 3 or 4 v1i64 vectors. Therefore the opcode is not
   updated which later lead to a fixed writeback instruction being
   constructed with an extra operand for the register writeback.

This patch addresses the two issues as follows:
- it adds the necessary mapping from VLD1d64TPseudoWB_register and
  VLD1d64QPseudoWB_register to VLD1d64Twb_register and
  VLD1d64Qwb_register respectively. Like for the existing _fixed
  variants, the cost of these is bumped for unaligned access.
- it changes the logic in SelectVLD and SelectVSD to call isVLDfixed
  and isVSTfixed respectively to decide whether the opcode should be
  updated. It also reworks the logic and comments for pushing the
  writeback offset operand and r0 operand to clarify the logic:
  writeback offset needs to be pushed if it's a register writeback,
  r0 needs to be pushed if not and the instruction is a
  VLD1/VLD2/VST1/VST2.

Reviewers: rengolin, t.p.northover, samparker

Reviewed By: samparker

Patch by Thomas Preud'homme <thomas.preudhomme@arm.com>

Differential Revision: https://reviews.llvm.org/D42970

llvm-svn: 326570
2018-03-02 13:02:55 +00:00

153 lines
6.1 KiB
LLVM

; RUN: llc -mtriple=arm-eabi -mattr=+neon -fast-isel=0 -O0 %s -o - | FileCheck %s
define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst3i8:
;Check the alignment value. Max for this instruction is 64 bits:
;This test runs at -O0 so do not check for specific register numbers.
;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
%tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32)
ret void
}
define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vst3i16:
;CHECK: vst3.16
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
ret void
}
define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vst3i32:
;CHECK: vst3.32
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
ret void
}
;Check for a post-increment updating store.
define void @vst3i32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK-LABEL: vst3i32_update:
;CHECK: vst3.32 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
%A = load i32*, i32** %ptr
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
%tmp2 = getelementptr i32, i32* %A, i32 6
store i32* %tmp2, i32** %ptr
ret void
}
define void @vst3f(float* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: vst3f:
;CHECK: vst3.32
%tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
ret void
}
define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK-LABEL: vst3i64:
;Check the alignment value. Max for this instruction is 64 bits:
;This test runs at -O0 so do not check for specific register numbers.
;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 16)
ret void
}
define void @vst3i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
;CHECK-LABEL: vst3i64_update
;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
%A = load i64*, i64** %ptr
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
%tmp2 = getelementptr i64, i64* %A, i32 3
store i64* %tmp2, i64** %ptr
ret void
}
define void @vst3i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
;CHECK-LABEL: vst3i64_reg_update
;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}], r{{.*}}
%A = load i64*, i64** %ptr
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
%tmp2 = getelementptr i64, i64* %A, i32 1
store i64* %tmp2, i64** %ptr
ret void
}
define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vst3Qi8:
;Check the alignment value. Max for this instruction is 64 bits:
;This test runs at -O0 so do not check for specific register numbers.
;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]!
;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
%tmp1 = load <16 x i8>, <16 x i8>* %B
call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 32)
ret void
}
define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vst3Qi16:
;CHECK: vst3.16
;CHECK: vst3.16
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
ret void
}
;Check for a post-increment updating store.
define void @vst3Qi16_update(i16** %ptr, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vst3Qi16_update:
;CHECK: vst3.16 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
;CHECK: vst3.16 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
%A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
%tmp2 = getelementptr i16, i16* %A, i32 24
store i16* %tmp2, i16** %ptr
ret void
}
define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: vst3Qi32:
;CHECK: vst3.32
;CHECK: vst3.32
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
ret void
}
define void @vst3Qf(float* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: vst3Qf:
;CHECK: vst3.32
;CHECK: vst3.32
%tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
ret void
}
declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst3.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind