mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
[LoopVectorize] Don't preserve nsw/nuw flags on shrunken ops.
If we're shrinking a binary operation, it may be the case that the new operation wraps where the old one didn't. If this happens, the behavior should be well-defined. So, we can't always carry wrapping flags with us when we shrink operations. If we do, we get incorrect optimizations in cases like: void foo(const unsigned char *from, unsigned char *to, int n) { for (int i = 0; i < n; i++) to[i] = from[i] - 128; } which gets optimized to: void foo(const unsigned char *from, unsigned char *to, int n) { for (int i = 0; i < n; i++) to[i] = from[i] | 128; } This happens because: - InstCombine turned `sub i32 %from.i, 128` into `add nuw nsw i32 %from.i, 128`. - LoopVectorize vectorized the add to be `add nuw nsw <16 x i8>` with a vector full of `i8 128`s. - InstCombine took advantage of the fact that the newly-shrunken add "couldn't wrap", and changed the `add` to an `or`. InstCombine seems happy to figure out whether we can add nuw/nsw on its own, so I just decided to drop the flags. There are already a number of places in LoopVectorize where we rely on InstCombine to clean up. llvm-svn: 305053
This commit is contained in:
parent
4aa0695c53
commit
5cb9a3f362
@ -360,9 +360,9 @@ public:
|
||||
/// Copy I's fast-math flags
|
||||
void copyFastMathFlags(const Instruction *I);
|
||||
|
||||
/// Convenience method to copy supported wrapping, exact, and fast-math flags
|
||||
/// from V to this instruction.
|
||||
void copyIRFlags(const Value *V);
|
||||
/// Convenience method to copy supported exact, fast-math, and (optionally)
|
||||
/// wrapping flags from V to this instruction.
|
||||
void copyIRFlags(const Value *V, bool IncludeWrapFlags = true);
|
||||
|
||||
/// Logical 'and' of any supported wrapping, exact, and fast-math flags of
|
||||
/// V and this instruction.
|
||||
|
@ -216,10 +216,10 @@ void Instruction::copyFastMathFlags(const Instruction *I) {
|
||||
copyFastMathFlags(I->getFastMathFlags());
|
||||
}
|
||||
|
||||
void Instruction::copyIRFlags(const Value *V) {
|
||||
void Instruction::copyIRFlags(const Value *V, bool IncludeWrapFlags) {
|
||||
// Copy the wrapping flags.
|
||||
if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
|
||||
if (isa<OverflowingBinaryOperator>(this)) {
|
||||
if (IncludeWrapFlags && isa<OverflowingBinaryOperator>(this)) {
|
||||
if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
|
||||
setHasNoSignedWrap(OB->hasNoSignedWrap());
|
||||
setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
|
||||
}
|
||||
|
@ -3814,7 +3814,11 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
|
||||
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
|
||||
NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
|
||||
ShrinkOperand(BO->getOperand(1)));
|
||||
cast<BinaryOperator>(NewI)->copyIRFlags(I);
|
||||
|
||||
// Any wrapping introduced by shrinking this operation shouldn't be
|
||||
// considered undefined behavior. So, we can't unconditionally copy
|
||||
// arithmetic wrapping flags to NewI.
|
||||
cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
|
||||
} else if (auto *CI = dyn_cast<ICmpInst>(I)) {
|
||||
NewI =
|
||||
B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
|
||||
|
@ -5,7 +5,7 @@ target triple = "aarch64"
|
||||
|
||||
; CHECK-LABEL: @add_a(
|
||||
; CHECK: load <16 x i8>, <16 x i8>*
|
||||
; CHECK: add nuw nsw <16 x i8>
|
||||
; CHECK: add <16 x i8>
|
||||
; CHECK: store <16 x i8>
|
||||
; Function Attrs: nounwind
|
||||
define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
|
||||
@ -31,9 +31,37 @@ for.body: ; preds = %entry, %for.body
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
|
||||
; working with.
|
||||
; CHECK-LABEL: @add_a1(
|
||||
; CHECK: load <16 x i8>, <16 x i8>*
|
||||
; CHECK: add nuw nsw <16 x i8>
|
||||
; CHECK: store <16 x i8>
|
||||
; Function Attrs: nounwind
|
||||
define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
|
||||
entry:
|
||||
%cmp8 = icmp sgt i32 %len, 0
|
||||
br i1 %cmp8, label %for.body, label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body, %entry
|
||||
ret void
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
|
||||
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
|
||||
%0 = load i8, i8* %arrayidx
|
||||
%add = add nuw nsw i8 %0, 2
|
||||
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
|
||||
store i8 %add, i8* %arrayidx3
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||
%exitcond = icmp eq i32 %lftr.wideiv, %len
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @add_b(
|
||||
; CHECK: load <8 x i16>, <8 x i16>*
|
||||
; CHECK: add nuw nsw <8 x i16>
|
||||
; CHECK: add <8 x i16>
|
||||
; CHECK: store <8 x i16>
|
||||
; Function Attrs: nounwind
|
||||
define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
|
||||
@ -61,7 +89,7 @@ for.body: ; preds = %entry, %for.body
|
||||
|
||||
; CHECK-LABEL: @add_c(
|
||||
; CHECK: load <8 x i8>, <8 x i8>*
|
||||
; CHECK: add nuw nsw <8 x i16>
|
||||
; CHECK: add <8 x i16>
|
||||
; CHECK: store <8 x i16>
|
||||
; Function Attrs: nounwind
|
||||
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
|
||||
@ -116,12 +144,12 @@ for.body: ; preds = %entry, %for.body
|
||||
; CHECK-LABEL: @add_e(
|
||||
; CHECK: load <16 x i8>
|
||||
; CHECK: shl <16 x i8>
|
||||
; CHECK: add nuw nsw <16 x i8>
|
||||
; CHECK: add <16 x i8>
|
||||
; CHECK: or <16 x i8>
|
||||
; CHECK: mul nuw nsw <16 x i8>
|
||||
; CHECK: mul <16 x i8>
|
||||
; CHECK: and <16 x i8>
|
||||
; CHECK: xor <16 x i8>
|
||||
; CHECK: mul nuw nsw <16 x i8>
|
||||
; CHECK: mul <16 x i8>
|
||||
; CHECK: store <16 x i8>
|
||||
define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
|
||||
entry:
|
||||
@ -162,12 +190,12 @@ for.body: ; preds = %for.body, %for.body
|
||||
; CHECK: load <8 x i16>
|
||||
; CHECK: trunc <8 x i16>
|
||||
; CHECK: shl <8 x i8>
|
||||
; CHECK: add nsw <8 x i8>
|
||||
; CHECK: add <8 x i8>
|
||||
; CHECK: or <8 x i8>
|
||||
; CHECK: mul nuw nsw <8 x i8>
|
||||
; CHECK: mul <8 x i8>
|
||||
; CHECK: and <8 x i8>
|
||||
; CHECK: xor <8 x i8>
|
||||
; CHECK: mul nuw nsw <8 x i8>
|
||||
; CHECK: mul <8 x i8>
|
||||
; CHECK: store <8 x i8>
|
||||
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
|
||||
entry:
|
||||
|
Loading…
Reference in New Issue
Block a user