mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-25 05:52:53 +02:00
ade2d32df0
This provides more realistic costs for the insert/extractelement instructions (which are load/store pairs), accounts for the cheap unaligned Altivec load sequence, and for unaligned VSX load/stores. Bad news: MultiSource/Applications/sgefa/sgefa - 35% slowdown (this will require more investigation) SingleSource/Benchmarks/McGill/queens - 20% slowdown (we no longer vectorize this, but it was a constant store that was scalarized) MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2 - 2% slowdown Good news: SingleSource/Benchmarks/Shootout/ary3 - 54% speedup SingleSource/Benchmarks/Shootout-C++/ary - 40% speedup MultiSource/Benchmarks/Ptrdist/ks/ks - 35% speedup MultiSource/Benchmarks/FreeBench/neural/neural - 30% speedup MultiSource/Benchmarks/TSVC/Symbolics-flt/Symbolics-flt - 20% speedup Unfortunately, estimating the costs of the stack-based scalarization sequences is hard, and adjusting these costs is like a game of whac-a-mole :( I'll revisit this again after we have better codegen for vector extloads and truncstores and unaligned load/stores. llvm-svn: 205658
17 lines
568 B
LLVM
17 lines
568 B
LLVM
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
|
|
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
|
|
target triple = "powerpc64-unknown-linux-gnu"
|
|
|
|
define i32 @insert(i32 %arg) {
|
|
; CHECK: cost of 10 {{.*}} insertelement
|
|
%x = insertelement <4 x i32> undef, i32 %arg, i32 0
|
|
ret i32 undef
|
|
}
|
|
|
|
define i32 @extract(<4 x i32> %arg) {
|
|
; CHECK: cost of 3 {{.*}} extractelement
|
|
%x = extractelement <4 x i32> %arg, i32 0
|
|
ret i32 %x
|
|
}
|
|
|