1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 19:12:56 +02:00

[InstCombine] Added vector demanded bits support for SSE4A EXTRQ/INSERTQ instructions

The SSE4A instructions EXTRQ/INSERTQ only use the lower 64 bits (or fewer) of many of their input vector operands, and all of them leave the upper 64 bits of their result undefined.

Differential Revision: http://reviews.llvm.org/D12680

llvm-svn: 247934
This commit is contained in:
Simon Pilgrim 2015-09-17 20:32:45 +00:00
parent 4a51a0af8a
commit 76acca71c9
3 changed files with 327 additions and 126 deletions

View File

@ -527,6 +527,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}
auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth)
{
APInt UndefElts(Width, 0);
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
};
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::objectsize: {
@ -975,6 +982,54 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return ReplaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_sse4a_extrq: {
  // EXTRQ reads only the lowest 64 bits of its first 128-bit vector operand
  // and the lowest 16 bits of its second, so only the low elements of each
  // operand are demanded: 1 of 2 for operand 0 and 2 of 16 for operand 1.
  Value *Arg0 = II->getArgOperand(0);
  Value *Arg1 = II->getArgOperand(1);
  unsigned VWidth0 = Arg0->getType()->getVectorNumElements();
  unsigned VWidth1 = Arg1->getType()->getVectorNumElements();
  assert(VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes");

  // Operand 0 is tried first; operand 1 is only attempted when the helper
  // returned null for operand 0.
  Value *NewArg0 = SimplifyDemandedVectorEltsLow(Arg0, VWidth0, 1);
  if (NewArg0) {
    II->setArgOperand(0, NewArg0);
    return II;
  }

  Value *NewArg1 = SimplifyDemandedVectorEltsLow(Arg1, VWidth1, 2);
  if (NewArg1) {
    II->setArgOperand(1, NewArg1);
    return II;
  }
  break;
}
case Intrinsic::x86_sse4a_extrqi: {
  // EXTRQI reads only the lowest 64 bits of its first 128-bit vector
  // operand, so only element 0 of that two-element operand is demanded.
  Value *Arg0 = II->getArgOperand(0);
  unsigned VWidth = Arg0->getType()->getVectorNumElements();
  assert(VWidth == 2 && "Unexpected operand size");

  // A non-null result is the replacement for operand 0.
  Value *NewArg0 = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1);
  if (NewArg0) {
    II->setArgOperand(0, NewArg0);
    return II;
  }
  break;
}
case Intrinsic::x86_sse4a_insertq: {
  // INSERTQ reads only the lowest 64 bits of its first 128-bit vector
  // operand, so only element 0 of that two-element operand is demanded.
  Value *Arg0 = II->getArgOperand(0);
  unsigned VWidth = Arg0->getType()->getVectorNumElements();
  assert(VWidth == 2 && "Unexpected operand size");

  // A non-null result is the replacement for operand 0.
  Value *NewArg0 = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1);
  if (NewArg0) {
    II->setArgOperand(0, NewArg0);
    return II;
  }
  break;
}
case Intrinsic::x86_sse4a_insertqi: {
// insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
// ones undef
@ -1051,6 +1106,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
}
}
// INSERTQI uses only the lowest 64 bits of each of its first two 128-bit
// vector operands, so only element 0 of each two-element operand is
// demanded.
Value *Op0 = II->getArgOperand(0);
Value *Op1 = II->getArgOperand(1);
unsigned VWidth0 = Op0->getType()->getVectorNumElements();
unsigned VWidth1 = Op1->getType()->getVectorNumElements();
assert(VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes");
// Try operand 0 first; a non-null result replaces the operand and the
// modified intrinsic is returned immediately.
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
II->setArgOperand(0, V);
return II;
}
// Operand 1 is only attempted when operand 0 produced no replacement.
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
II->setArgOperand(1, V);
return II;
}
break;
}

View File

@ -1237,6 +1237,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
break;
// SSE4A instructions leave the upper 64 bits of the 128-bit result
// in an undefined state, so the high VWidth / 2 bits of UndefElts are set
// for all four EXTRQ/INSERTQ intrinsic variants.
// NOTE(review): VWidth is declared outside this hunk; presumably it is the
// element count of the result vector - confirm against the enclosing function.
case Intrinsic::x86_sse4a_extrq:
case Intrinsic::x86_sse4a_extrqi:
case Intrinsic::x86_sse4a_insertq:
case Intrinsic::x86_sse4a_insertqi:
UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2);
break;
}
break;
}

View File

@ -121,5 +121,124 @@ define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
ret <2 x i64> %1
}
;
; Vector Demanded Bits
;
; Operand 0 of extrq is a shufflevector that copies element 0 of %x into both
; lanes. The expected output calls extrq on %x directly, with the shuffle gone.
define <2 x i64> @test_extrq_arg0(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_extrq_arg0
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %y) nounwind
ret <2 x i64> %2
}
; Operand 1 of extrq is a shufflevector that keeps elements 0-7 of %y and fills
; lanes 8-15 with element 0. The expected output passes %y directly, with the
; shuffle gone.
define <2 x i64> @test_extrq_arg1(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_extrq_arg1
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <16 x i8> %y, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
ret <2 x i64> %2
}
; Both extrq operands come from shufflevectors: operand 0 copies element 0 of
; %x into both lanes, operand 1 keeps elements 0-7 of %y and fills lanes 8-15
; with element 0. The expected output calls extrq on %x and %y directly.
define <2 x i64> @test_extrq_args01(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_extrq_args01
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = shufflevector <16 x i8> %y, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%3 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %1, <16 x i8> %2) nounwind
ret <2 x i64> %3
}
; Only element 1 of the extrq result is used (the shufflevector copies it into
; both lanes). The expected output is a plain return of undef.
define <2 x i64> @test_extrq_ret(<2 x i64> %x, <16 x i8> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_extrq_ret
; CHECK-NEXT: ret <2 x i64> undef
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %y) nounwind
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %2
}
; Operand 0 of extrqi is a shufflevector that copies element 0 of %x into both
; lanes. The expected output calls extrqi on %x directly, with the shuffle gone
; and the immediate arguments (3, 2) unchanged.
define <2 x i64> @test_extrqi_arg0(<2 x i64> %x) nounwind uwtable ssp {
; CHECK-LABEL: @test_extrqi_arg0
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %1, i8 3, i8 2)
ret <2 x i64> %2
}
; Only element 1 of the extrqi result is used (the shufflevector copies it into
; both lanes). The expected output is a plain return of undef.
define <2 x i64> @test_extrqi_ret(<2 x i64> %x) nounwind uwtable ssp {
; CHECK-LABEL: @test_extrqi_ret
; CHECK-NEXT: ret <2 x i64> undef
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2) nounwind
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %2
}
; Operand 0 of insertq is a shufflevector that copies element 0 of %x into both
; lanes. The expected output calls insertq on %x directly, with the shuffle
; gone.
define <2 x i64> @test_insertq_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_insertq_arg0
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %1, <2 x i64> %y) nounwind
ret <2 x i64> %2
}
; Only element 1 of the insertq result is used (the shufflevector copies it
; into both lanes). The expected output is a plain return of undef.
define <2 x i64> @test_insertq_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_insertq_ret
; CHECK-NEXT: ret <2 x i64> undef
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %2
}
; Operand 0 of insertqi is a shufflevector that copies element 0 of %x into
; both lanes. The expected output calls insertqi on %x directly, with the
; shuffle gone and the immediate arguments (3, 2) unchanged.
define <2 x i64> @test_insertqi_arg0(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_insertqi_arg0
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %y, i8 3, i8 2) nounwind
ret <2 x i64> %2
}
; Operand 1 of insertqi is a shufflevector that copies element 0 of %y into
; both lanes. The expected output passes %y directly, with the shuffle gone.
define <2 x i64> @test_insertqi_arg1(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_insertqi_arg1
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %y, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %1, i8 3, i8 2) nounwind
ret <2 x i64> %2
}
; Both vector operands of insertqi come from shufflevectors that copy element 0
; of %x (operand 0) and of %y (operand 1) into both lanes. The expected output
; calls insertqi on %x and %y directly.
define <2 x i64> @test_insertqi_args01(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_insertqi_args01
; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2)
; CHECK-NEXT: ret <2 x i64> %1
%1 = shufflevector <2 x i64> %x, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = shufflevector <2 x i64> %y, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%3 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %2, i8 3, i8 2) nounwind
ret <2 x i64> %3
}
; Only element 1 of the insertqi result is used (the shufflevector copies it
; into both lanes). The expected output is a plain return of undef.
define <2 x i64> @test_insertqi_ret(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; CHECK-LABEL: @test_insertqi_ret
; CHECK-NEXT: ret <2 x i64> undef
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 3, i8 2) nounwind
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %2
}
; Declarations of the four SSE4A intrinsics called by the tests in this file.
; Each declaration is also expected to appear in the output.
; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrq
declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
; CHECK: declare <2 x i64> @llvm.x86.sse4a.extrqi
declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertq
declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind