Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2025-01-31 12:41:49 +01:00)
[SVE] Add support for folding for select + masked loads
Add folds to instcombine to support removal of the select instruction when the
masked_load is guaranteed to zero the same lanes, i.e.
select(mask, mload(,,mask,0), 0) -> mload(,,mask,0).

Patch originally authored by @paulwalker-arm

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D106376
This commit is contained in:
parent: 9af238dfc2
commit: 6f6b3d4f7a
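As an illustrative sketch (not part of the commit; it mirrors the new tests added below), the basic fold rewrites IR of this shape:

  %load   = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> undef)
  %masked = select <4 x i1> %mask, <4 x float> %load, <4 x float> zeroinitializer
  ret <4 x float> %masked

into a single masked load whose passthrough supplies the zeros, making the select redundant:

  %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load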
@@ -2109,6 +2109,14 @@ template <Intrinsic::ID IntrID> inline IntrinsicID_match m_Intrinsic() {
   return IntrinsicID_match(IntrID);
 }
 
+/// Matches MaskedLoad Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+             const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
+}
+
 template <Intrinsic::ID IntrID, typename T0>
 inline typename m_Intrinsic_Ty<T0>::Ty m_Intrinsic(const T0 &Op0) {
   return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
@@ -3216,5 +3216,36 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder))
     return replaceInstUsesWith(SI, Fr);
 
+  // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0)
+  // Load inst is intentionally not checked for hasOneUse()
+  if (match(FalseVal, m_Zero()) &&
+      match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
+                                  m_CombineOr(m_Undef(), m_Zero())))) {
+    auto *MaskedLoad = cast<IntrinsicInst>(TrueVal);
+    if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
+      MaskedLoad->setArgOperand(3, FalseVal /* Zero */);
+    return replaceInstUsesWith(SI, MaskedLoad);
+  }
+
+  Value *Mask;
+  if (match(TrueVal, m_Zero()) &&
+      match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
+                                   m_CombineOr(m_Undef(), m_Zero())))) {
+    // We can remove the select by ensuring the load zeros all lanes the
+    // select would have. We determine this by proving there is no overlap
+    // between the load and select masks.
+    // (i.e (load_mask & select_mask) == 0 == no overlap)
+    bool CanMergeSelectIntoLoad = false;
+    if (Value *V = SimplifyAndInst(CondVal, Mask, SQ.getWithInstruction(&SI)))
+      CanMergeSelectIntoLoad = match(V, m_Zero());
+
+    if (CanMergeSelectIntoLoad) {
+      auto *MaskedLoad = cast<IntrinsicInst>(FalseVal);
+      if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
+        MaskedLoad->setArgOperand(3, TrueVal /* Zero */);
+      return replaceInstUsesWith(SI, MaskedLoad);
+    }
+  }
+
   return nullptr;
 }
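The second pattern handles the inverted form, where the select zeroes lanes the load never activates because the two masks cannot overlap. A condensed sketch of the kind of IR it targets (taken from the tests added below):

  %mask   = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load   = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load

Here SimplifyAndInst proves %inv_mask & %mask folds to all-false, so the select is dropped and the load's passthrough becomes zeroinitializer.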
test/Transforms/InstCombine/select-masked_load.ll (new file, 98 lines)
@@ -0,0 +1,98 @@
; RUN: opt < %s -instcombine -S | FileCheck %s

; Fold zeroing of inactive lanes into the load's passthrough parameter.
define <4 x float> @masked_load_and_zero_inactive_1(<4 x float>* %ptr, <4 x i1> %mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_1(
; CHECK: %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)
; CHECK-NEXT: ret <4 x float> %load
  %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> undef)
  %masked = select <4 x i1> %mask, <4 x float> %load, <4 x float> zeroinitializer
  ret <4 x float> %masked
}

; As above but reuse the load's existing passthrough.
define <4 x i32> @masked_load_and_zero_inactive_2(<4 x i32>* %ptr, <4 x i1> %mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_2(
; CHECK: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
; CHECK-NEXT: ret <4 x i32> %load
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
  %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
  ret <4 x i32> %masked
}

; No transform when the load's passthrough cannot be reused or altered.
define <4 x i32> @masked_load_and_zero_inactive_3(<4 x i32>* %ptr, <4 x i1> %mask, <4 x i32> %passthrough) {
; CHECK-LABEL: @masked_load_and_zero_inactive_3(
; CHECK: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
; CHECK-NEXT: %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> %masked
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
  %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
  ret <4 x i32> %masked
}

; Remove redundant select when its mask doesn't overlap with the load mask.
define <4 x i32> @masked_load_and_zero_inactive_4(<4 x i32>* %ptr, <4 x i1> %inv_mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_4(
; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
; CHECK-NEXT: ret <4 x i32> %load
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; As above but reuse the load's existing passthrough.
define <4 x i32> @masked_load_and_zero_inactive_5(<4 x i32>* %ptr, <4 x i1> %inv_mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_5(
; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
; CHECK-NEXT: ret <4 x i32> %load
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; No transform when the load's passthrough cannot be reused or altered.
define <4 x i32> @masked_load_and_zero_inactive_6(<4 x i32>* %ptr, <4 x i1> %inv_mask, <4 x i32> %passthrough) {
; CHECK-LABEL: @masked_load_and_zero_inactive_6(
; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
; CHECK-NEXT: %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
; CHECK-NEXT: ret <4 x i32> %masked
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; No transform when select and load masks have no relation.
define <4 x i32> @masked_load_and_zero_inactive_7(<4 x i32>* %ptr, <4 x i1> %mask1, <4 x i1> %mask2) {
; CHECK-LABEL: @masked_load_and_zero_inactive_7(
; CHECK: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask1, <4 x i32> zeroinitializer)
; CHECK-NEXT: %masked = select <4 x i1> %mask2, <4 x i32> zeroinitializer, <4 x i32> %load
; CHECK-NEXT: ret <4 x i32> %masked
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask1, <4 x i32> zeroinitializer)
  %masked = select <4 x i1> %mask2, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; A more complex case where we can prove the select mask is a subset of the
; load's inactive lanes and thus the load's passthrough takes effect.
define <4 x float> @masked_load_and_zero_inactive_8(<4 x float>* %ptr, <4 x i1> %inv_mask, <4 x i1> %cond) {
; CHECK-LABEL: @masked_load_and_zero_inactive_8(
; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: %pg = and <4 x i1> %mask, %cond
; CHECK-NEXT: %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %pg, <4 x float> zeroinitializer)
; CHECK-NEXT: ret <4 x float> %load
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %pg = and <4 x i1> %mask, %cond
  %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %pg, <4 x float> undef)
  %masked = select <4 x i1> %inv_mask, <4 x float> zeroinitializer, <4 x float> %load
  ret <4 x float> %masked
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
@@ -12,17 +12,16 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -30,7 +29,7 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK: .lr.ph:
 ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -60,27 +59,25 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[VEC_IND]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD1]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
-; CHECK-NEXT: [[TMP12]] = add i32 [[TMP11]], [[TMP9]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD1]])
+; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -88,7 +85,7 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK: .lr.ph:
 ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -252,19 +252,18 @@ define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -394,19 +393,18 @@ define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i16 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i16 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
 ;
 entry:
@@ -488,19 +486,18 @@ define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i8 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
 ;
 entry: