llvm-mirror/test/CodeGen/X86/MergeConsecutiveStores.ll
Commit dff7c9bff1 by James Y Knight: Fix two issues in MergeConsecutiveStores:
1) PR25154. This is basically a repeat of PR18102, which was fixed in
r200201, and broken again by r234430. The latter changed which of the
store nodes was merged into from the first to the last. Thus, we now
also need to prefer merging a later store at a given address into the
target node, instead of an earlier one.

2) While investigating that, I also realized I'd introduced a bug in
r236850. There, I removed a check for alignment -- not realizing that
nothing except the alignment check was ensuring that none of the stores
were overlapping! This is a really bogus way to ensure that there are no
aliased stores.

A better solution to both of these issues is likely to always use the
code added in the 'if (UseAA)' branches which rearrange the chain based
on a more principled analysis. I'll look into whether that can be used
always, but in the interest of getting things back to working, I think a
minimal change makes sense.

llvm-svn: 251816
2015-11-02 18:48:08 +00:00
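
As a rough illustration (not part of the original test or commit), the pattern
described in (1) looks like this: when two of the candidate stores hit the same
address, the value of the later store is the one that must survive the merge.

    store i8 1, i8* %p        ; earlier store to %p
    store i8 2, i8* %p        ; later store to the same address; its value (2)
                              ; must be the one that ends up in memory
    %p1 = getelementptr inbounds i8, i8* %p, i64 1
    store i8 3, i8* %p1       ; adjacent store that makes the run a merge candidate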


; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck %s
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
; CHECK-LABEL: merge_const_store:
; save 1,2,3 ... as one big integer.
; CHECK: movabsq $578437695752307201
; CHECK: ret
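; For reference, 578437695752307201 is 0x0807060504030201: the eight byte
; constants 1..8 laid out little-endian, so the eight single-byte stores in the
; loop body can be emitted as one 64-bit immediate store.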
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
%.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
%2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
store i8 1, i8* %2, align 1
%3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
store i8 2, i8* %3, align 1
%4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
store i8 3, i8* %4, align 1
%5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
store i8 4, i8* %5, align 1
%6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
store i8 5, i8* %6, align 1
%7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
store i8 6, i8* %7, align 1
%8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
store i8 7, i8* %8, align 1
%9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
store i8 8, i8* %9, align 1
%10 = add nsw i32 %i.02, 1
%11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
%exitcond = icmp eq i32 %10, %count
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
ret void
}
; No vectors because we use noimplicitfloat
; CHECK-LABEL: merge_const_store_no_vec:
; CHECK-NOT: vmovups
; CHECK: ret
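; With noimplicitfloat the backend may not materialize the zeros in a YMM/XMM
; register, so the eight i32 zero-stores are expected to merge into scalar
; integer stores instead (likely a few 64-bit moves of immediate zero); the
; checks only require that no 32-byte vector move appears.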
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
%.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
%2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
store i32 0, i32* %2, align 4
%3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
store i32 0, i32* %3, align 4
%4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
store i32 0, i32* %4, align 4
%5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
store i32 0, i32* %5, align 4
%6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
store i32 0, i32* %6, align 4
%7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
store i32 0, i32* %7, align 4
%8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
store i32 0, i32* %8, align 4
%9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
store i32 0, i32* %9, align 4
%10 = add nsw i32 %i.02, 1
%11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
%exitcond = icmp eq i32 %10, %count
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
ret void
}
; Move the constants using a single vector store.
; CHECK-LABEL: merge_const_store_vec:
; CHECK: vmovups
; CHECK: ret
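; The eight consecutive i32 zero-stores cover 32 contiguous bytes, so with AVX
; available they should collapse into a single 256-bit unaligned vector store
; of a zeroed YMM register.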
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
%.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
%2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
store i32 0, i32* %2, align 4
%3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
store i32 0, i32* %3, align 4
%4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
store i32 0, i32* %4, align 4
%5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
store i32 0, i32* %5, align 4
%6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
store i32 0, i32* %6, align 4
%7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
store i32 0, i32* %7, align 4
%8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
store i32 0, i32* %8, align 4
%9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
store i32 0, i32* %9, align 4
%10 = add nsw i32 %i.02, 1
%11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
%exitcond = icmp eq i32 %10, %count
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
ret void
}
; Merge the first four constant bytes into a single 32-bit store. Move the rest as scalars.
; CHECK-LABEL: merge_nonconst_store:
; CHECK: movl $67305985
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
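; For reference, 67305985 is 0x04030201: the first four byte constants 1..4
; laid out little-endian and emitted as one 32-bit immediate store. The
; remaining four bytes (%zz and the constants 6, 7, 8) are stored with
; individual byte moves, as the checks above expect.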
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
%i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
%.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
%2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
store i8 1, i8* %2, align 1
%3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
store i8 2, i8* %3, align 1
%4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
store i8 3, i8* %4, align 1
%5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
store i8 4, i8* %5, align 1
%6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 %zz, i8* %6, align 1 ; <----------- Not a constant.
%7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
store i8 6, i8* %7, align 1
%8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
store i8 7, i8* %8, align 1
%9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
store i8 8, i8* %9, align 1
%10 = add nsw i32 %i.02, 1
%11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
%exitcond = icmp eq i32 %10, %count
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
ret void
}
; CHECK-LABEL: merge_loads_i16:
; load:
; CHECK: movw
; store:
; CHECK: movw
; CHECK: ret
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
%3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
br label %4
; <label>:4 ; preds = %4, %.lr.ph
%i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
%.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
%5 = load i8, i8* %2, align 1
%6 = load i8, i8* %3, align 1
%7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
store i8 %5, i8* %7, align 1
%8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
store i8 %6, i8* %8, align 1
%9 = add nsw i32 %i.02, 1
%10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
%exitcond = icmp eq i32 %9, %count
br i1 %exitcond, label %._crit_edge, label %4
._crit_edge: ; preds = %4, %0
ret void
}
; The loads and the stores are interleaved. Can't merge them.
; CHECK-LABEL: no_merge_loads:
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
%3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
br label %a4
a4: ; preds = %4, %.lr.ph
%i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
%.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
%a5 = load i8, i8* %2, align 1
%a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
store i8 %a5, i8* %a7, align 1
%a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
%a6 = load i8, i8* %3, align 1
store i8 %a6, i8* %a8, align 1
%a9 = add nsw i32 %i.02, 1
%a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
%exitcond = icmp eq i32 %a9, %count
br i1 %exitcond, label %._crit_edge, label %a4
._crit_edge: ; preds = %4, %0
ret void
}
; CHECK-LABEL: merge_loads_integer:
; load:
; CHECK: movq
; store:
; CHECK: movq
; CHECK: ret
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
%3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
br label %4
; <label>:4 ; preds = %4, %.lr.ph
%i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
%.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
%5 = load i32, i32* %2
%6 = load i32, i32* %3
%7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
store i32 %5, i32* %7
%8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
store i32 %6, i32* %8
%9 = add nsw i32 %i.02, 1
%10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
%exitcond = icmp eq i32 %9, %count
br i1 %exitcond, label %._crit_edge, label %4
._crit_edge: ; preds = %4, %0
ret void
}
; CHECK-LABEL: merge_loads_vector:
; load:
; CHECK: movups
; store:
; CHECK: movups
; CHECK: ret
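; Four adjacent i32 loads from %q and four adjacent i32 stores to %p each cover
; 16 contiguous bytes, so (with the noalias guarantee on %q and %p) they should
; become one unaligned 128-bit load and one unaligned 128-bit store.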
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
%a1 = icmp sgt i32 %count, 0
br i1 %a1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
%a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
%a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
%a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
br label %block4
block4: ; preds = %4, %.lr.ph
%i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
%.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
%a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
%a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
%a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
%a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
%b1 = load i32, i32* %a2
%b2 = load i32, i32* %a3
%b3 = load i32, i32* %a4
%b4 = load i32, i32* %a5
store i32 %b1, i32* %a7
store i32 %b2, i32* %a8
store i32 %b3, i32* %a9
store i32 %b4, i32* %a10
%c9 = add nsw i32 %i.02, 1
%c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
%exitcond = icmp eq i32 %c9, %count
br i1 %exitcond, label %._crit_edge, label %block4
._crit_edge: ; preds = %4, %0
ret void
}
;; On x86, even unaligned copies should be merged to vector ops.
;; TODO: however, this cannot happen at the moment, due to brokenness
;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
;; visitSTORE.
; CHECK-LABEL: merge_loads_no_align:
; load:
; CHECK-NOT: vmovups ;; TODO
; store:
; CHECK-NOT: vmovups ;; TODO
; CHECK: ret
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
%a1 = icmp sgt i32 %count, 0
br i1 %a1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
%a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
%a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
%a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
br label %block4
block4: ; preds = %4, %.lr.ph
%i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
%.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
%a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
%a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
%a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
%a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
%b1 = load i32, i32* %a2, align 1
%b2 = load i32, i32* %a3, align 1
%b3 = load i32, i32* %a4, align 1
%b4 = load i32, i32* %a5, align 1
store i32 %b1, i32* %a7, align 1
store i32 %b2, i32* %a8, align 1
store i32 %b3, i32* %a9, align 1
store i32 %b4, i32* %a10, align 1
%c9 = add nsw i32 %i.02, 1
%c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
%exitcond = icmp eq i32 %c9, %count
br i1 %exitcond, label %._crit_edge, label %block4
._crit_edge: ; preds = %4, %0
ret void
}
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK: movw [[REG]], (%{{.*}})
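; The two byte loads read %c + %3 and %c + %3 + 1, and the two byte stores
; write %.08 and %.08 + 1, so both pairs are provably adjacent and can be
; widened to a single 16-bit load (base + index addressing) and a single
; 16-bit store.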
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
br label %1
; <label>:1
%.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
%.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
%.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
%2 = getelementptr inbounds i64, i64* %.0, i64 1
%3 = load i64, i64* %.0, align 1
%4 = getelementptr inbounds i8, i8* %c, i64 %3
%5 = load i8, i8* %4, align 1
%6 = add i64 %3, 1
%7 = getelementptr inbounds i8, i8* %c, i64 %6
%8 = load i8, i8* %7, align 1
store i8 %5, i8* %.08, align 1
%9 = getelementptr inbounds i8, i8* %.08, i64 1
store i8 %8, i8* %9, align 1
%10 = getelementptr inbounds i8, i8* %.08, i64 2
%11 = add nsw i32 %.09, -1
%12 = icmp eq i32 %11, 0
br i1 %12, label %13, label %1
; <label>:13
ret void
}
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK: movw [[REG]], (%{{.*}})
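; Here the index is sign-extended before use (%4 = sext i8 %3 to i64) and the
; +1 is added to the already-extended i64 value, so the second address is
; provably %c + %4 + 1 and the same 16-bit widening still applies.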
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
br label %1
; <label>:1
%.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
%.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
%.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
%2 = getelementptr inbounds i8, i8* %.0, i64 1
%3 = load i8, i8* %.0, align 1
%4 = sext i8 %3 to i64
%5 = getelementptr inbounds i8, i8* %c, i64 %4
%6 = load i8, i8* %5, align 1
%7 = add i64 %4, 1
%8 = getelementptr inbounds i8, i8* %c, i64 %7
%9 = load i8, i8* %8, align 1
store i8 %6, i8* %.08, align 1
%10 = getelementptr inbounds i8, i8* %.08, i64 1
store i8 %9, i8* %10, align 1
%11 = getelementptr inbounds i8, i8* %.08, i64 2
%12 = add nsw i32 %.09, -1
%13 = icmp eq i32 %12, 0
br i1 %13, label %14, label %1
; <label>:14
ret void
}
; However, sign extensions can only be looked through when they apply to all of
; the address computations involved.
; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; CHECK-NOT: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
; CHECK-NOT: movw [[REG]], (%{{.*}})
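; In this variant the +1 is added to the narrow i8 value (%7 = add i8 %3, 1)
; and only then sign-extended, so the addition may wrap and the two accesses
; are not provably adjacent; no 16-bit widening is expected.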
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
br label %1
; <label>:1
%.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
%.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
%.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
%2 = getelementptr inbounds i8, i8* %.0, i64 1
%3 = load i8, i8* %.0, align 1
%4 = sext i8 %3 to i64
%5 = getelementptr inbounds i8, i8* %c, i64 %4
%6 = load i8, i8* %5, align 1
%7 = add i8 %3, 1
%wrap.4 = sext i8 %7 to i64
%8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
%9 = load i8, i8* %8, align 1
store i8 %6, i8* %.08, align 1
%10 = getelementptr inbounds i8, i8* %.08, i64 1
store i8 %9, i8* %10, align 1
%11 = getelementptr inbounds i8, i8* %.08, i64 2
%12 = add nsw i32 %.09, -1
%13 = icmp eq i32 %12, 0
br i1 %13, label %14, label %1
; <label>:14
ret void
}
; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
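; All eight lanes of %v are extracted and stored to eight consecutive float
; slots starting at %ptr, so the whole sequence should collapse back into a
; single unaligned 32-byte store of the original YMM value.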
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
%vecext0 = extractelement <8 x float> %v, i32 0
%vecext1 = extractelement <8 x float> %v, i32 1
%vecext2 = extractelement <8 x float> %v, i32 2
%vecext3 = extractelement <8 x float> %v, i32 3
%vecext4 = extractelement <8 x float> %v, i32 4
%vecext5 = extractelement <8 x float> %v, i32 5
%vecext6 = extractelement <8 x float> %v, i32 6
%vecext7 = extractelement <8 x float> %v, i32 7
%arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
%arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
%arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
%arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
%arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
%arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
%arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
store float %vecext0, float* %ptr, align 4
store float %vecext1, float* %arrayidx1, align 4
store float %vecext2, float* %arrayidx2, align 4
store float %vecext3, float* %arrayidx3, align 4
store float %vecext4, float* %arrayidx4, align 4
store float %vecext5, float* %arrayidx5, align 4
store float %vecext6, float* %arrayidx6, align 4
store float %vecext7, float* %arrayidx7, align 4
ret void
; CHECK-LABEL: merge_vec_element_store
; CHECK: vmovups
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
}
; PR21711 - Merge vector stores into wider vector stores.
; These should be merged into 32-byte stores.
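; A <4 x float> slot is 16 bytes, so the indices 3, 4, 5 and 6 below are byte
; offsets 48, 64, 80 and 96 from %ptr; the two halves of %v1 land at 48 and 64
; (one 32-byte store) and the two halves of %v2 at 80 and 96 (a second one).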
define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
%idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
%idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
%shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
ret void
; CHECK-LABEL: merge_vec_extract_stores
; CHECK: vmovups %ymm0, 48(%rdi)
; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
}
; Merging vector stores when sourced from vector loads is not currently handled.
define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
%load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
%load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
%v0 = load <4 x float>, <4 x float>* %load_idx0
%v1 = load <4 x float>, <4 x float>* %load_idx1
%store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
%store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
store <4 x float> %v0, <4 x float>* %store_idx0, align 16
store <4 x float> %v1, <4 x float>* %store_idx1, align 16
ret void
; CHECK-LABEL: merge_vec_stores_from_loads
; CHECK: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: retq
}
; Merging vector stores when sourced from a constant vector is not currently handled.
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
ret void
; CHECK-LABEL: merge_vec_stores_of_constants
; CHECK: vxorps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: retq
}
; This is a minimized test based on real code that was failing.
; We could merge stores (and loads) like this...
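; The scalar load covers bytes 0-7 of the array and the value stored to %idx5
; comes from bytes 8-15 (element 0 of the <2 x i64> load); the stores write
; bytes 32-39 and 40-47. In principle this could become one 16-byte load plus
; one 16-byte store; the checks below document that today it remains two
; scalar 8-byte load/store pairs.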
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
%idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
%idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
%idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
%idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5
%a0 = load i64, i64* %idx0, align 8
store i64 %a0, i64* %idx4, align 8
%b = bitcast i64* %idx1 to <2 x i64>*
%v = load <2 x i64>, <2 x i64>* %b, align 8
%a1 = extractelement <2 x i64> %v, i32 0
store i64 %a1, i64* %idx5, align 8
ret void
; CHECK-LABEL: merge_vec_element_and_scalar_load
; CHECK: movq (%rdi), %rax
; CHECK-NEXT: movq %rax, 32(%rdi)
; CHECK-NEXT: movq 8(%rdi), %rax
; CHECK-NEXT: movq %rax, 40(%rdi)
; CHECK-NEXT: retq
}