mirror of https://github.com/RPCS3/llvm-mirror.git
855b755233
Currently combineInsertEltToShuffle turns insert_vector_elt into a vector_shuffle even if the inserted element is a vector with a single element. In that case the additional shuffle is unlikely to be more efficient than an insert_vector_elt.

Additionally, this fixes an infinite cycle in DAGCombine, where combineInsertEltToShuffle turns an insert_vector_elt into a shuffle, which gets turned back into an insert_vector_elt/extract_vector_elt pair by a custom AArch64 lowering (in visitVECTOR_SHUFFLE). Such insert_vector_elt and extract_vector_elt combinations can be lowered efficiently using mov on AArch64.

There are 2 test changes in arm64-neon-copy.ll: we now use one or two mov instructions instead of a single zip1. The second mov in ins1f2 is needed to move the result into the result register and is, I think, not really related to this DAGCombine fold. In any case, on most uarchs mov should be cheaper than zip1; on a Cortex-A75, for example, zip1 is twice as expensive as mov (https://developer.arm.com/docs/101398/latest/arm-cortex-a75-software-optimization-guide-v20).

Reviewers: spatel, efriedma, dmgreen, RKSimon

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D80710
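As a reduced sketch of the pattern involved (an illustrative example, not part of this patch or its tests: the function name @insert_from_v1 is made up, and this minimal form may or may not reproduce the cycle on its own, depending on how the single-element vector is materialized in the DAG):

; Insert the only element of a <1 x double> into a <4 x double>. Before this
; change, DAGCombine could turn the resulting insert_vector_elt into a
; vector_shuffle, which the custom AArch64 shuffle lowering turned back into
; an insert/extract pair, re-triggering the combine.
define <4 x double> @insert_from_v1(<1 x double> %v, <4 x double> %acc) {
  %elt = extractelement <1 x double> %v, i32 0
  %ins = insertelement <4 x double> %acc, double %elt, i32 2
  ret <4 x double> %ins
}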
36 lines
1.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc %s -o - | FileCheck %s
target triple = "arm64-apple-ios13.4.0"
; Make sure we do not get stuck in a cycle in DAGCombiner.
define void @test(i1 %c, <1 x double>* %ptr) {
; CHECK-LABEL: test:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: tbz w0, #0, LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb1
; CHECK-NEXT: ldr d0, [x1]
; CHECK-NEXT: LBB0_2: ; %bb2
; CHECK-NEXT: ldr q1, [x8]
; CHECK-NEXT: mov.d v1[0], v0[0]
; CHECK-NEXT: str q1, [x8]
; CHECK-NEXT: ret
entry:
br i1 %c, label %bb1, label %bb2
bb1:
%lv1 = load <1 x double>, <1 x double>* %ptr, align 16
br label %bb2
bb2:
%p = phi <1 x double> [ %lv1, %bb1 ], [ zeroinitializer, %entry ]
%vecext19 = extractelement <1 x double> %p, i32 0
%arrayidx21 = getelementptr inbounds [4 x <4 x double>], [4 x <4 x double>]* undef, i64 0, i64 3
%lv2 = load <4 x double>, <4 x double>* %arrayidx21, align 16
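; The insertelement below (a single <1 x double> element going into a
; <4 x double>) is what the old combine turned into a shuffle, triggering the cycle.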
%vecins22 = insertelement <4 x double> %lv2, double %vecext19, i32 2
store <4 x double> %vecins22, <4 x double>* %arrayidx21, align 16
ret void
}