Fix the alignment requirements of several unpck and shuf instructions.

Generalize isPSHUFDMask and add a unary SHUFPD pattern so that SHUFPD's
memory operand alignment can be tested as well, with a fix to avoid
breaking MMX's use of isPSHUFDMask.

llvm-svn: 40756
commit 1afde4166e
parent 7d1e35e6d1
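Background for the change: the SSE shuffle and unpack instructions patched below (shufps/shufpd, unpckhps/unpcklps, unpckhpd/unpcklpd, pandn) require their 128-bit memory operands to be 16-byte aligned, but the selection patterns matched a plain load, which also matches under-aligned loads. A minimal illustration of the failure mode, in the spirit of function @b of the new test (hypothetical function name):

    ; Only "align 4" is known here, so folding the load into unpckhps's
    ; memory operand would assume alignment the IR does not guarantee;
    ; with this change the load is emitted as a separate movups instead.
    define <4 x float> @fold(<4 x float> %a, <4 x float>* %p)
    {
      %b = load <4 x float>* %p, align 4
      %s = shufflevector <4 x float> %a, <4 x float> %b,
                         <4 x i32> <i32 2, i32 6, i32 3, i32 7>
      ret <4 x float> %s
    }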
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1578,7 +1578,7 @@ static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
 bool X86::isPSHUFDMask(SDNode *N) {
   assert(N->getOpcode() == ISD::BUILD_VECTOR);
 
-  if (N->getNumOperands() != 4)
+  if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
     return false;
 
   // Check if the value doesn't reference the second vector.
@@ -1586,7 +1586,7 @@ bool X86::isPSHUFDMask(SDNode *N) {
     SDOperand Arg = N->getOperand(i);
     if (Arg.getOpcode() == ISD::UNDEF) continue;
     assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
+    if (cast<ConstantSDNode>(Arg)->getValue() >= e)
       return false;
   }
 
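Taken together, the two hunks above generalize the predicate from "exactly four elements, indices below 4" to "two or four elements, indices below the element count". Reconstructed from the diff (the loop header and final return are not shown in the hunks and are assumed):

    bool X86::isPSHUFDMask(SDNode *N) {
      assert(N->getOpcode() == ISD::BUILD_VECTOR);

      if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
        return false;

      // Check if the value doesn't reference the second vector.
      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { // assumed
        SDOperand Arg = N->getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
        if (cast<ConstantSDNode>(Arg)->getValue() >= e)
          return false;
      }

      return true; // assumed
    }

With two-element masks accepted, a unary v2f64 shuffle now qualifies as PSHUFD-style, which is what lets the new unary SHUFPD pattern further below fire.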
@@ -2767,7 +2767,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
 
   // If VT is integer, try PSHUF* first, then SHUFP*.
   if (MVT::isInteger(VT)) {
-    if (X86::isPSHUFDMask(PermMask.Val) ||
+    // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically
+    // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
+    if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) &&
+         X86::isPSHUFDMask(PermMask.Val)) ||
        X86::isPSHUFHWMask(PermMask.Val) ||
        X86::isPSHUFLWMask(PermMask.Val)) {
      if (V2.getOpcode() != ISD::UNDEF)
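The new size guard is the MMX fix mentioned in the commit message: generalizing isPSHUFDMask to two-element masks would otherwise let 64-bit MMX vectors onto a path that emits PSHUFD, an SSE2 instruction MMX lacks. Only 4-element 64-bit vectors (v4i16, which PSHUFW can handle) stay eligible. A hypothetical v2i32 case the guard now excludes:

    ; This 2-element mask now satisfies isPSHUFDMask, but the
    ; (getSizeInBits(VT) != 64 || NumElems == 4) guard keeps this
    ; v2i32 (MMX-sized) shuffle from being lowered as PSHUFD.
    define <2 x i32> @swap(<2 x i32> %v)
    {
      %r = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
      ret <2 x i32> %r
    }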
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -808,7 +808,7 @@ let isTwoAddress = 1 in {
                      "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set VR128:$dst,
                        (v4f32 (vector_shuffle
-                               VR128:$src1, (load addr:$src2),
+                               VR128:$src1, (memopv4f32 addr:$src2),
                                SHUFP_shuffle_mask:$src3)))]>;
 
 let AddedComplexity = 10 in {
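The (load ...) to (memopv4f32 ...) change is the actual alignment fix: the memop pattern fragments match only loads whose known alignment is at least 16 bytes. A sketch of the kind of definition involved (assumed, not copied verbatim from the tree):

    // Sketch: memop refines a plain load with an alignment predicate, so
    // patterns written against memopv4f32 decline to fold loads that are
    // not known to be 16-byte aligned.
    def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
      return cast<LoadSDNode>(N)->getAlignment() >= 16;
    }]>;
    def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;

The same substitution repeats in each hunk below for unpckhps/unpcklps, the v2f64 shufpd/unpckhpd/unpcklpd forms, and pandn.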
@@ -824,7 +824,7 @@ let isTwoAddress = 1 in {
                       "unpckhps\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (vector_shuffle
-                                VR128:$src1, (load addr:$src2),
+                                VR128:$src1, (memopv4f32 addr:$src2),
                                 UNPCKH_shuffle_mask)))]>;
 
 def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
@@ -839,7 +839,7 @@ let isTwoAddress = 1 in {
                       "unpcklps\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (vector_shuffle
-                                VR128:$src1, (load addr:$src2),
+                                VR128:$src1, (memopv4f32 addr:$src2),
                                 UNPCKL_shuffle_mask)))]>;
 } // AddedComplexity
 } // isTwoAddress
@@ -1561,7 +1561,7 @@ let isTwoAddress = 1 in {
                      "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set VR128:$dst,
                        (v2f64 (vector_shuffle
-                               VR128:$src1, (load addr:$src2),
+                               VR128:$src1, (memopv2f64 addr:$src2),
                                SHUFP_shuffle_mask:$src3)))]>;
 
 let AddedComplexity = 10 in {
@@ -1577,7 +1577,7 @@ let isTwoAddress = 1 in {
                       "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v2f64 (vector_shuffle
-                                VR128:$src1, (load addr:$src2),
+                                VR128:$src1, (memopv2f64 addr:$src2),
                                 UNPCKH_shuffle_mask)))]>;
 
 def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
@@ -1592,7 +1592,7 @@ let isTwoAddress = 1 in {
                       "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v2f64 (vector_shuffle
-                                VR128:$src1, (load addr:$src2),
+                                VR128:$src1, (memopv2f64 addr:$src2),
                                 UNPCKL_shuffle_mask)))]>;
 } // AddedComplexity
 } // isTwoAddress
@@ -1782,7 +1782,7 @@ let isTwoAddress = 1 in {
                      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                      "pandn\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
-                                               (load addr:$src2))))]>;
+                                               (memopv2i64 addr:$src2))))]>;
 }
 
 // SSE2 Integer comparison
@@ -2419,6 +2419,11 @@ def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef),
                           SHUFP_unary_shuffle_mask:$sm),
           (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
       Requires<[HasSSE1]>;
+// Special unary SHUFPDrri case.
+def : Pat<(vector_shuffle (v2f64 VR128:$src1), (undef),
+                          SHUFP_unary_shuffle_mask:$sm),
+          (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+      Requires<[HasSSE2]>;
 // Unary v4f32 shuffle with PSHUF* in order to fold a load.
 def : Pat<(vector_shuffle (memopv4f32 addr:$src1), (undef),
                           SHUFP_unary_shuffle_mask:$sm),
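The new unary SHUFPDrri pattern mirrors the SHUFPSrri one directly above it: when both shuffle inputs are the same register (the second operand is undef), shufpd is emitted with the register duplicated, and that is the case function @c of the new test exercises. In isolation (hypothetical function name):

    ; A unary v2f64 element swap; the new pattern selects
    ; shufpd $1, %xmm0, %xmm0 for it.
    define <2 x double> @revd(<2 x double> %v)
    {
      %r = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
      ret <2 x double> %r
    }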
@@ -2583,13 +2588,13 @@ def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
           (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
 
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
-                  (load addr:$src2))),
+                  (memopv2i64 addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
-                  (load addr:$src2))),
+                  (memopv2i64 addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
-                  (load addr:$src2))),
+                  (memopv2i64 addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 
 // Use movaps / movups for SSE integer load / store (one byte shorter).
--- /dev/null
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -0,0 +1,50 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep unpck | wc -l | grep 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep shuf | wc -l | grep 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep ps | wc -l | grep 4
+; RUN: llvm-as < %s | llc -march=x86-64 | grep pd | wc -l | grep 4
+; RUN: llvm-as < %s | llc -march=x86-64 | grep movup | wc -l | grep 4
+
+define <4 x float> @a(<4 x float>* %y)
+{
+  %x = load <4 x float>* %y, align 4
+  %a = extractelement <4 x float> %x, i32 0
+  %b = extractelement <4 x float> %x, i32 1
+  %c = extractelement <4 x float> %x, i32 2
+  %d = extractelement <4 x float> %x, i32 3
+  %p = insertelement <4 x float> undef, float %d, i32 0
+  %q = insertelement <4 x float> %p, float %c, i32 1
+  %r = insertelement <4 x float> %q, float %b, i32 2
+  %s = insertelement <4 x float> %r, float %a, i32 3
+  ret <4 x float> %s
+}
+define <4 x float> @b(<4 x float>* %y, <4 x float> %z)
+{
+  %x = load <4 x float>* %y, align 4
+  %a = extractelement <4 x float> %x, i32 2
+  %b = extractelement <4 x float> %x, i32 3
+  %c = extractelement <4 x float> %z, i32 2
+  %d = extractelement <4 x float> %z, i32 3
+  %p = insertelement <4 x float> undef, float %c, i32 0
+  %q = insertelement <4 x float> %p, float %a, i32 1
+  %r = insertelement <4 x float> %q, float %d, i32 2
+  %s = insertelement <4 x float> %r, float %b, i32 3
+  ret <4 x float> %s
+}
+define <2 x double> @c(<2 x double>* %y)
+{
+  %x = load <2 x double>* %y, align 8
+  %a = extractelement <2 x double> %x, i32 0
+  %c = extractelement <2 x double> %x, i32 1
+  %p = insertelement <2 x double> undef, double %c, i32 0
+  %r = insertelement <2 x double> %p, double %a, i32 1
+  ret <2 x double> %r
+}
+define <2 x double> @d(<2 x double>* %y, <2 x double> %z)
+{
+  %x = load <2 x double>* %y, align 8
+  %a = extractelement <2 x double> %x, i32 1
+  %c = extractelement <2 x double> %z, i32 1
+  %p = insertelement <2 x double> undef, double %c, i32 0
+  %r = insertelement <2 x double> %p, double %a, i32 1
+  ret <2 x double> %r
+}