mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-22 04:22:57 +02:00
33bcd3de54
This is a redo of D7208 ( r227242 - http://llvm.org/viewvc/llvm-project?view=revision&revision=227242 ). The patch was reverted because an AArch64 target could infinite loop after the change in DAGCombiner to merge vector stores. That happened because AArch64's allowsMisalignedMemoryAccesses() wasn't telling the truth. It reported all unaligned memory accesses as fast, but then split some 128-bit unaligned accesses up in performSTORECombine() because they are slow. This patch attempts to fix the problem in AArch64's allowsMisalignedMemoryAccesses() while preserving existing (perhaps questionable) lowering behavior. The x86 test shows that store merging is working as intended for a target with fast 32-byte unaligned stores. Differential Revision: http://reviews.llvm.org/D12635 llvm-svn: 248622
51 lines
1.9 KiB
LLVM
; RUN: llc -march aarch64 %s -o - | FileCheck %s
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE

; @g0 is 16-byte aligned, so it can be read with full-width vector loads;
; @g1 is only 4-byte aligned, so accesses to it must respect that weaker
; alignment when stores are merged.
@g0 = external global <3 x float>, align 16
@g1 = external global <3 x float>, align 4

; Expected codegen for @blam (below): the two lanes of @g0 are loaded as a
; scalar load plus a lane insert, and the two 4-byte stores into @g1 are
; merged into a single 8-byte (d-register) store.
; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
; CHECK: str d[[R0]]
; Store lanes 0 and 1 of @g0 into consecutive floats of @g1.  The two
; adjacent 4-byte stores should be merged into one 8-byte store (see the
; CHECK lines above the definition of @blam).
define void @blam() {
  ; g1[0] = g0[0]
  %tmp4 = getelementptr inbounds <3 x float>, <3 x float>* @g1, i64 0, i64 0
  %tmp5 = load <3 x float>, <3 x float>* @g0, align 16
  %tmp6 = extractelement <3 x float> %tmp5, i64 0
  store float %tmp6, float* %tmp4
  ; g1[1] = g0[1] (the vector is re-loaded; the combiner still merges the stores)
  %tmp7 = getelementptr inbounds float, float* %tmp4, i64 1
  %tmp8 = load <3 x float>, <3 x float>* @g0, align 16
  %tmp9 = extractelement <3 x float> %tmp8, i64 1
  store float %tmp9, float* %tmp7
  ret void
}
; PR21711 - Merge vector stores into wider vector stores.

; On Cyclone, the stores should not get merged into a 16-byte store because
; unaligned 16-byte stores are slow. This test would infinite loop when
; the fastness of unaligned accesses was not specified correctly.
; Extract the low and high <2 x float> halves of %v1 and store them to two
; adjacent 8-byte slots.  On a generic target the pair is merged into one
; unaligned 16-byte store; on Cyclone (slow unaligned 16-byte stores) it
; must stay as two 8-byte stores.
define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
  ; Two consecutive <2 x float> slots: byte offsets 24 and 32 from %ptr.
  %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
  %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4

  ; Split %v1 into its low (lanes 0,1) and high (lanes 2,3) halves.
  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>

  store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
  store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
  ret void

; Generic AArch64: one unaligned 16-byte store (q-register) at offset 24.
; CHECK-LABEL: merge_vec_extract_stores
; CHECK: stur q0, [x0, #24]
; CHECK-NEXT: ret

; Cyclone: keep two 8-byte stores; the high half is extracted with ext.
; CYCLONE-LABEL: merge_vec_extract_stores
; CYCLONE: ext v1.16b, v0.16b, v0.16b, #8
; CYCLONE-NEXT: str d0, [x0, #24]
; CYCLONE-NEXT: str d1, [x0, #32]
; CYCLONE-NEXT: ret
}