From 3f0fc5a029fc8d2f7fb4c1be23328861ed7a01fe Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Mon, 5 Jan 2015 17:02:28 +0000 Subject: [PATCH] [AArch64] Improve codegen of store lane 0 instructions by directly storing the subregister. For 0-lane stores, we used to generate code similar to: fmov w8, s0 str w8, [x0, x1, lsl #2] instead of: str s0, [x0, x1, lsl #2] To correct that: for store lane 0 patterns, directly match to a STR of the lane-0 subregister (h0/s0/d0). Byte-sized instructions don't have the special case for a 0 index, because FPR8s are defined to have untyped content. rdar://16372710 Differential Revision: http://reviews.llvm.org/D6772 llvm-svn: 225181 --- lib/Target/AArch64/AArch64InstrInfo.td | 27 ++++++++ test/CodeGen/AArch64/arm64-st1.ll | 92 ++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index f4a555499d2..cae02d0a32e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -1889,6 +1889,33 @@ let Predicates = [IsLE] in { } } // AddedComplexity = 10 +// Match stores from lane 0 to the appropriate subreg's store. 
+multiclass VecROStoreLane0Pat { + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 19 in { + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; +} + //--- // (unsigned immediate) defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", diff --git a/test/CodeGen/AArch64/arm64-st1.ll b/test/CodeGen/AArch64/arm64-st1.ll index 4370484478c..a4818bd8850 100644 --- a/test/CodeGen/AArch64/arm64-st1.ll +++ b/test/CodeGen/AArch64/arm64-st1.ll @@ -8,6 +8,16 @@ define void @st1lane_16b(<16 x i8> %A, i8* %D) { ret void } +define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_16b +; CHECK: umov.b w[[WREG:[0-9]+]], v0[0] +; CHECK: strb w[[WREG]], [x0, x1] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <16 x i8> %A, i32 0 + store i8 %tmp, i8* %ptr + ret void +} + define void @st1lane_8h(<8 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane_8h ; CHECK: st1.h @@ -16,6 +26,15 @@ define void @st1lane_8h(<8 x i16> %A, i16* %D) { ret void } +define void @st1lane0_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_8h +; CHECK: str h0, [x0, x1, lsl #1] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <8 x i16> %A, i32 0 + store i16 %tmp, i16* %ptr + ret void +} + define void @st1lane_4s(<4 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane_4s ; CHECK: st1.s @@ -24,6 +43,15 @@ define void @st1lane_4s(<4 x i32> %A, i32* %D) { 
ret void } +define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_4s +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <4 x i32> %A, i32 0 + store i32 %tmp, i32* %ptr + ret void +} + define void @st1lane_4s_float(<4 x float> %A, float* %D) { ; CHECK-LABEL: st1lane_4s_float ; CHECK: st1.s @@ -32,6 +60,15 @@ define void @st1lane_4s_float(<4 x float> %A, float* %D) { ret void } +define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_4s_float +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <4 x float> %A, i32 0 + store float %tmp, float* %ptr + ret void +} + define void @st1lane_2d(<2 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane_2d ; CHECK: st1.d @@ -40,6 +77,15 @@ define void @st1lane_2d(<2 x i64> %A, i64* %D) { ret void } +define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2d +; CHECK: str d0, [x0, x1, lsl #3] + %ptr = getelementptr i64* %D, i64 %offset + %tmp = extractelement <2 x i64> %A, i32 0 + store i64 %tmp, i64* %ptr + ret void +} + define void @st1lane_2d_double(<2 x double> %A, double* %D) { ; CHECK-LABEL: st1lane_2d_double ; CHECK: st1.d @@ -48,6 +94,15 @@ define void @st1lane_2d_double(<2 x double> %A, double* %D) { ret void } +define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2d_double +; CHECK: str d0, [x0, x1, lsl #3] + %ptr = getelementptr double* %D, i64 %offset + %tmp = extractelement <2 x double> %A, i32 0 + store double %tmp, double* %ptr + ret void +} + define void @st1lane_8b(<8 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane_8b ; CHECK: st1.b @@ -56,6 +111,16 @@ define void @st1lane_8b(<8 x i8> %A, i8* %D) { ret void } +define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_8b +; CHECK: umov.b w[[WREG:[0-9]+]], v0[0] +; 
CHECK: strb w[[WREG]], [x0, x1] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <8 x i8> %A, i32 0 + store i8 %tmp, i8* %ptr + ret void +} + define void @st1lane_4h(<4 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane_4h ; CHECK: st1.h @@ -64,6 +129,15 @@ define void @st1lane_4h(<4 x i16> %A, i16* %D) { ret void } +define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_4h +; CHECK: str h0, [x0, x1, lsl #1] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <4 x i16> %A, i32 0 + store i16 %tmp, i16* %ptr + ret void +} + define void @st1lane_2s(<2 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane_2s ; CHECK: st1.s @@ -72,6 +146,15 @@ define void @st1lane_2s(<2 x i32> %A, i32* %D) { ret void } +define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2s +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <2 x i32> %A, i32 0 + store i32 %tmp, i32* %ptr + ret void +} + define void @st1lane_2s_float(<2 x float> %A, float* %D) { ; CHECK-LABEL: st1lane_2s_float ; CHECK: st1.s @@ -80,6 +163,15 @@ define void @st1lane_2s_float(<2 x float> %A, float* %D) { ret void } +define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2s_float +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <2 x float> %A, i32 0 + store float %tmp, float* %ptr + ret void +} + define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) { ; CHECK-LABEL: st2lane_16b ; CHECK: st2.b