
[AArch64] Avoid unnecessary vector byte-swapping in big-endian

Summary:
Loads and stores of some NEON vector types are promoted to other vector
types with a different lane size but the same overall vector size. This
is not a problem in little-endian, but in big-endian it requires
additional byte reversals to preserve the lane ordering while keeping
the correct endianness of the data inside each lane.
For example:

%1 = load <4 x half>, <4 x half>* %p

results in the following assembly:

ld1 { v0.2s }, [x1]
rev32 v0.4h, v0.4h
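
To make the lane rules concrete, here is a small standalone C++ sketch (not
part of the patch; the byte values and helper names are invented for
illustration) that models how a big-endian LD1 places each big-endian memory
element into a register lane, and shows why loading a <4 x half> as two
32-bit lanes needs the extra rev32:

// Minimal model of AArch64 big-endian LD1 element loads (illustrative only).
#include <cstdint>
#include <cstdio>

// Read 'size' bytes at p as one big-endian element.
static uint64_t load_be(const uint8_t *p, unsigned size) {
  uint64_t v = 0;
  for (unsigned i = 0; i < size; ++i)
    v = (v << 8) | p[i];
  return v;
}

// Simulate ld1 {v.<lanes><size>}: element k is read big-endian from memory
// and placed into register lane k, i.e. bits [k*8*size, (k+1)*8*size).
static uint64_t ld1(const uint8_t *mem, unsigned size, unsigned lanes) {
  uint64_t reg = 0;
  for (unsigned k = 0; k < lanes; ++k)
    reg |= load_be(mem + k * size, size) << (8 * size * k);
  return reg;
}

int main() {
  // A <4 x half> in big-endian memory: element 0 at the lowest address,
  // each 16-bit element stored most-significant byte first.
  const uint8_t mem[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};

  uint64_t as_4h = ld1(mem, 2, 4); // ld1 { v0.4h }
  uint64_t as_2s = ld1(mem, 4, 2); // ld1 { v0.2s }

  for (unsigned j = 0; j < 4; ++j) // 16-bit lane j = bits [16j, 16j+16)
    printf("lane %u: 4h load = %04x, 2s load = %04x\n", j,
           (unsigned)((as_4h >> (16 * j)) & 0xffff),
           (unsigned)((as_2s >> (16 * j)) & 0xffff));

  // The .4h load yields lanes 1122 3344 5566 7788 (in order); the .2s load
  // yields 3344 1122 7788 5566, i.e. the halves are swapped within each
  // 32-bit word, which is exactly what the extra "rev32 v0.4h" undoes.
  return 0;
}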

This patch changes the promotion of these loads/stores so that the
actual vector load/store (LD1/ST1) handles the endianness correctly and
no further byte reversals are needed. The example above now results in
the following assembly:

ld1 { v0.4h }, [x1]
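
The store direction benefits in the same way. A companion sketch to the one
above (again with invented values, not LLVM code): an ST1 writes each
register lane back to memory most-significant byte first, so storing a
<4 x half> with st1 {v.4h} reproduces the in-memory layout directly, while
storing it as two 32-bit lanes would first need a rev32:

// Minimal model of AArch64 big-endian ST1 element stores (illustrative only).
#include <cstdint>
#include <cstdio>

// Simulate st1 {v.<lanes><size>}: register lane k (bits [k*8*size, ...)) is
// written to memory at offset k*size, most significant byte first.
static void st1(uint64_t reg, uint8_t *mem, unsigned size, unsigned lanes) {
  for (unsigned k = 0; k < lanes; ++k) {
    uint64_t lane = (reg >> (8 * size * k)) & ((1ULL << (8 * size)) - 1);
    for (unsigned i = 0; i < size; ++i)
      mem[k * size + i] = (uint8_t)(lane >> (8 * (size - 1 - i)));
  }
}

int main() {
  // Register holding a <4 x half> whose 16-bit lanes are 0x1122, 0x3344,
  // 0x5566, 0x7788 (lane 0 in the least significant bits).
  uint64_t reg = 0x7788566633441122ULL;

  uint8_t via_4h[8], via_2s[8];
  st1(reg, via_4h, 2, 4); // st1 { v0.4h } -> 11 22 33 44 55 66 77 88
  st1(reg, via_2s, 4, 2); // st1 { v0.2s } -> 33 44 11 22 77 88 55 66

  for (unsigned i = 0; i < 8; ++i)
    printf("byte %u: st1.4h = %02x, st1.2s = %02x\n", i,
           (unsigned)via_4h[i], (unsigned)via_2s[i]);
  return 0;
}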

Reviewers: olista01, SjoerdMeijer, efriedma

Reviewed By: efriedma

Subscribers: aemerson, rengolin, javed.absar, llvm-commits, kristof.beyls

Differential Revision: https://reviews.llvm.org/D42235

llvm-svn: 323325
Pablo Barrio 2018-01-24 14:13:47 +00:00
parent 4391d3c641
commit 818f6ba8dc
2 changed files with 75 additions and 66 deletions


@@ -721,18 +721,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 }

 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
-    setOperationAction(ISD::LOAD, VT, Promote);
-    AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
+  assert(VT.isVector() && "VT should be a vector type");

-    setOperationAction(ISD::STORE, VT, Promote);
-    AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
-    setOperationAction(ISD::LOAD, VT, Promote);
-    AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
-
-    setOperationAction(ISD::STORE, VT, Promote);
-    AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
+  if (VT.isFloatingPoint()) {
+    MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
+    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
+    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
   }

   // Mark vector float intrinsics as expand.
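
A note on the new logic above: changeVectorElementTypeToInteger() keeps the
lane count and lane width and only switches the element type to the
equally-sized integer, so each FP vector type is now promoted to an integer
vector with the same lane geometry (v2f32 to v2i32, v4f16 to v4i16, v8f16 to
v8i16, v4f32 to v4i32, v2f64 to v2i64), which is what lets LD1/ST1 handle the
endianness by itself. A minimal standalone driver showing that mapping,
assuming an LLVM tree of roughly this vintage (header paths and required
libraries are a guess and may differ between versions):

// Toy driver, not part of the patch: prints the integer type the new
// addTypeForNEON logic would promote each FP vector type to.
#include "llvm/CodeGen/ValueTypes.h" // EVT and MVT
#include <cstdio>

using namespace llvm;

int main() {
  const MVT FPVecs[] = {MVT::v2f32, MVT::v4f16, MVT::v2f64, MVT::v4f32,
                        MVT::v8f16};
  for (MVT VT : FPVecs) {
    // Same expression as in the patch: keep the lane count and width,
    // switch the element type to the same-sized integer.
    MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
    printf("%s -> %s\n", EVT(VT).getEVTString().c_str(),
           EVT(PromoteTo).getEVTString().c_str());
  }
  return 0;
}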


@ -53,8 +53,9 @@ define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
; CHECK-LABEL: test_i64_v4f16:
define void @test_i64_v4f16(<4 x half>* %p, i64* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: ld1 { v{{[0-9]+}}.4h }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev64 v{{[0-9]+}}.4h
; CHECK: str
%1 = load <4 x half>, <4 x half>* %p
@ -156,9 +157,11 @@ define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
; CHECK-LABEL: test_f64_v4f16:
define void @test_f64_v4f16(<4 x half>* %p, double* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: ld1 { v{{[0-9]+}}.4h }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev64 v{{[0-9]+}}.4h
; CHECK: fadd
; CHECK: str
%1 = load <4 x half>, <4 x half>* %p
%2 = fadd <4 x half> %1, %1
@ -233,8 +236,9 @@ define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
; CHECK-LABEL: test_v1i64_v4f16:
define void @test_v1i64_v4f16(<4 x half>* %p, <1 x i64>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: ld1 { v{{[0-9]+}}.4h }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev64 v{{[0-9]+}}.4h
; CHECK: str
%1 = load <4 x half>, <4 x half>* %p
@ -337,8 +341,9 @@ define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
; CHECK-LABEL: test_v2f32_v4f16:
define void @test_v2f32_v4f16(<4 x half>* %p, <2 x float>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: ld1 { v{{[0-9]+}}.4h }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
%1 = load <4 x half>, <4 x half>* %p
@ -506,8 +511,8 @@ define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
; CHECK-LABEL: test_v4i16_v4f16:
define void @test_v4i16_v4f16(<4 x half>* %p, <4 x i16>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: ld1 { v{{[0-9]+}}.4h }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load <4 x half>, <4 x half>* %p
%2 = fadd <4 x half> %1, %1
@ -534,8 +539,9 @@ define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
define void @test_v4f16_i64(i64* %p, <4 x half>* %q) {
; CHECK: ldr
; CHECK: rev64 v{{[0-9]+}}.4h
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = bitcast i64 %2 to <4 x half>
@ -548,8 +554,9 @@ define void @test_v4f16_i64(i64* %p, <4 x half>* %q) {
define void @test_v4f16_f64(double* %p, <4 x half>* %q) {
; CHECK: ldr
; CHECK: rev64 v{{[0-9]+}}.4h
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = bitcast double %2 to <4 x half>
@ -562,8 +569,9 @@ define void @test_v4f16_f64(double* %p, <4 x half>* %q) {
define void @test_v4f16_v1i64(<1 x i64>* %p, <4 x half>* %q) {
; CHECK: ldr
; CHECK: rev64 v{{[0-9]+}}.4h
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = bitcast <1 x i64> %2 to <4 x half>
@ -576,8 +584,9 @@ define void @test_v4f16_v1i64(<1 x i64>* %p, <4 x half>* %q) {
define void @test_v4f16_v2f32(<2 x float>* %p, <4 x half>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = bitcast <2 x float> %2 to <4 x half>
@ -590,8 +599,9 @@ define void @test_v4f16_v2f32(<2 x float>* %p, <4 x half>* %q) {
define void @test_v4f16_v2i32(<2 x i32>* %p, <4 x half>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2s }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = bitcast <2 x i32> %2 to <4 x half>
@ -603,8 +613,8 @@ define void @test_v4f16_v2i32(<2 x i32>* %p, <4 x half>* %q) {
; CHECK-LABEL: test_v4f16_v4i16:
define void @test_v4f16_v4i16(<4 x i16>* %p, <4 x half>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.4h }
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = bitcast <4 x i16> %2 to <4 x half>
@ -617,8 +627,9 @@ define void @test_v4f16_v4i16(<4 x i16>* %p, <4 x half>* %q) {
define void @test_v4f16_v8i8(<8 x i8>* %p, <4 x half>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.8b }
; CHECK: rev16 v{{[0-9]+}}.8b
; CHECK: rev32 v{{[0-9]+}}.4h
; CHECK: st1 { v{{[0-9]+}}.2s }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4h }
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = bitcast <8 x i8> %2 to <4 x half>
@ -733,8 +744,9 @@ define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
; CHECK-LABEL: test_f128_v4f32:
define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ext
; CHECK: str q
@ -814,8 +826,9 @@ define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
; CHECK-LABEL: test_v2f64_v4f32:
define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
%1 = load <4 x float>, <4 x float>* %p
@ -892,9 +905,11 @@ define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
; CHECK-LABEL: test_v2i64_v4f32:
define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK-NOT: rev
; CHECK: fadd
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: add
; CHECK: st1 { v{{[0-9]+}}.2d }
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
@ -948,8 +963,8 @@ define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
; CHECK: ldr q
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ext
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = bitcast fp128 %2 to <4 x float>
@ -962,8 +977,8 @@ define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = bitcast <2 x double> %2 to <4 x float>
@ -976,8 +991,9 @@ define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
; CHECK: fadd
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = bitcast <2 x i64> %2 to <4 x float>
@ -989,8 +1005,8 @@ define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
; CHECK-LABEL: test_v4f32_v4i32:
define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = bitcast <4 x i32> %2 to <4 x float>
@ -1003,8 +1019,8 @@ define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.8h }
; CHECK: rev32 v{{[0-9]+}}.8h
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = bitcast <8 x i16> %2 to <4 x float>
@ -1017,8 +1033,8 @@ define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.16b }
; CHECK: rev32 v{{[0-9]+}}.16b
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: st1 { v{{[0-9]+}}.2d }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = bitcast <16 x i8> %2 to <4 x float>
@ -1069,8 +1085,8 @@ define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
; CHECK-LABEL: test_v4i32_v4f32:
define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.4s }
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
@ -1148,9 +1164,9 @@ define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
; CHECK-LABEL: test_v8i16_v4f32:
define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK: rev32 v{{[0-9]+}}.8h
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.8h }
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
@ -1175,8 +1191,8 @@ define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
; CHECK-LABEL: test_v8i16_v8f16:
define void @test_v8i16_v8f16(<8 x half>* %p, <8 x i16>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.8h
; CHECK: ld1 { v{{[0-9]+}}.8h }
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.8h }
%1 = load <8 x half>, <8 x half>* %p
%2 = fadd <8 x half> %1, %1
@ -1241,9 +1257,9 @@ define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
; CHECK-LABEL: test_v16i8_v4f32:
define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.4s
; CHECK: ld1 { v{{[0-9]+}}.4s }
; CHECK: rev32 v{{[0-9]+}}.16b
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.16b }
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
@ -1268,9 +1284,9 @@ define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
; CHECK-LABEL: test_v16i8_v8f16:
define void @test_v16i8_v8f16(<8 x half>* %p, <16 x i8>* %q) {
; CHECK: ld1 { v{{[0-9]+}}.2d }
; CHECK: rev64 v{{[0-9]+}}.8h
; CHECK: ld1 { v{{[0-9]+}}.8h }
; CHECK: rev16 v{{[0-9]+}}.16b
; CHECK-NOT: rev
; CHECK: st1 { v{{[0-9]+}}.16b }
%1 = load <8 x half>, <8 x half>* %p
%2 = fadd <8 x half> %1, %1
@ -1297,9 +1313,8 @@ define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
%struct.struct1 = type { half, half, half, half }
define %struct.struct1 @test_v4f16_struct(%struct.struct1* %ret) {
entry:
; CHECK: ld1 { {{v[0-9]+}}.2s }
; CHECK: rev32
; CHECK-NOT; rev64
; CHECK: ld1 { {{v[0-9]+}}.4h }
; CHECK-NOT: rev
%0 = bitcast %struct.struct1* %ret to <4 x half>*
%1 = load <4 x half>, <4 x half>* %0, align 2
%2 = extractelement <4 x half> %1, i32 0