mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 10:42:39 +01:00
[PowerPC] Combine 64-bit bswap(load) without LDBRX
When targeting CPUs that don't have LDBRX, we end up producing code that is very inefficient and large for this common idiom. This patch just optimizes it to two 32-bit LWBRX instructions along with a merge. This fixes https://bugs.llvm.org/show_bug.cgi?id=49610 Differential revision: https://reviews.llvm.org/D104836
This commit is contained in:
parent
d354f79d7d
commit
377c8332ec
@ -15202,13 +15202,17 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
}
|
||||
}
|
||||
break;
|
||||
case ISD::BSWAP:
|
||||
case ISD::BSWAP: {
|
||||
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
|
||||
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
|
||||
N->getOperand(0).hasOneUse() &&
|
||||
// For subtargets without LDBRX, we can still do better than the default
|
||||
// expansion even for 64-bit BSWAP (LOAD).
|
||||
bool Is64BitBswapOn64BitTgt =
|
||||
Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;
|
||||
bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&
|
||||
N->getOperand(0).hasOneUse();
|
||||
if (IsSingleUseNormalLd &&
|
||||
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
|
||||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
|
||||
N->getValueType(0) == MVT::i64))) {
|
||||
(Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
|
||||
SDValue Load = N->getOperand(0);
|
||||
LoadSDNode *LD = cast<LoadSDNode>(Load);
|
||||
// Create the byte-swapping load.
|
||||
@ -15239,7 +15243,32 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
// Return N so it doesn't get rechecked!
|
||||
return SDValue(N, 0);
|
||||
}
|
||||
break;
|
||||
// Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
|
||||
// before legalization so that the BUILD_PAIR is handled correctly.
|
||||
if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
|
||||
!IsSingleUseNormalLd)
|
||||
return SDValue();
|
||||
LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));
|
||||
|
||||
// Can't split volatile or atomic loads.
|
||||
if (!LD->isSimple())
|
||||
return SDValue();
|
||||
SDValue BasePtr = LD->getBasePtr();
|
||||
SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
|
||||
LD->getPointerInfo(), LD->getAlignment());
|
||||
Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);
|
||||
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
|
||||
DAG.getIntPtrConstant(4, dl));
|
||||
SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
|
||||
LD->getPointerInfo(), LD->getAlignment());
|
||||
Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);
|
||||
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);
|
||||
SDValue TF =
|
||||
DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
|
||||
Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));
|
||||
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);
|
||||
return Res;
|
||||
}
|
||||
case PPCISD::VCMP:
|
||||
// If a VCMP_rec node already exists with exactly the same operands as this
|
||||
// node, use its result instead of this node (VCMP_rec computes both a CR6
|
||||
|
@ -101,6 +101,8 @@ define i16 @LHBRX(i8* %ptr, i32 %off) {
|
||||
ret i16 %tmp6
|
||||
}
|
||||
|
||||
; TODO: combine the bswap feeding a store on subtargets
|
||||
; that do not have an STDBRX.
|
||||
define void @STDBRX(i64 %i, i8* %ptr, i64 %off) {
|
||||
; PWR7_32-LABEL: STDBRX:
|
||||
; PWR7_32: # %bb.0:
|
||||
@ -149,19 +151,11 @@ define i64 @LDBRX(i8* %ptr, i64 %off) {
|
||||
;
|
||||
; X64-LABEL: LDBRX:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: ldx r4, r3, r4
|
||||
; X64-NEXT: rotldi r5, r4, 16
|
||||
; X64-NEXT: rotldi r3, r4, 8
|
||||
; X64-NEXT: rldimi r3, r5, 8, 48
|
||||
; X64-NEXT: rotldi r5, r4, 24
|
||||
; X64-NEXT: rldimi r3, r5, 16, 40
|
||||
; X64-NEXT: rotldi r5, r4, 32
|
||||
; X64-NEXT: rldimi r3, r5, 24, 32
|
||||
; X64-NEXT: rotldi r5, r4, 48
|
||||
; X64-NEXT: rldimi r3, r5, 40, 16
|
||||
; X64-NEXT: rotldi r5, r4, 56
|
||||
; X64-NEXT: rldimi r3, r5, 48, 8
|
||||
; X64-NEXT: rldimi r3, r4, 56, 0
|
||||
; X64-NEXT: li r5, 4
|
||||
; X64-NEXT: lwbrx r6, r3, r4
|
||||
; X64-NEXT: add r3, r3, r4
|
||||
; X64-NEXT: lwbrx r3, r3, r5
|
||||
; X64-NEXT: rldimi r3, r6, 32, 0
|
||||
; X64-NEXT: blr
|
||||
;
|
||||
; PWR7_64-LABEL: LDBRX:
|
||||
|
54
test/CodeGen/PowerPC/ld-bswap64-no-ldbrx.ll
Normal file
54
test/CodeGen/PowerPC/ld-bswap64-no-ldbrx.ll
Normal file
@ -0,0 +1,54 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=powerpc64-- -mcpu=pwr5 -verify-machineinstrs < %s | \
|
||||
; RUN: FileCheck %s
|
||||
define void @bs(i64* %p) {
; Naturally aligned i64 bswap(load) followed by a store: with -mcpu=pwr5
; (no LDBRX) the new combine splits the 64-bit bswap(load) into two
; byte-reversed 32-bit loads (lwbrx at offsets 0 and 4) merged with rldimi,
; instead of the long rotldi/rldimi shift expansion.
|
||||
; CHECK-LABEL: bs:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: li 4, 4
|
||||
; CHECK-NEXT: lwbrx 5, 0, 3
|
||||
; CHECK-NEXT: lwbrx 4, 3, 4
|
||||
; CHECK-NEXT: rldimi 4, 5, 32, 0
|
||||
; CHECK-NEXT: std 4, 0(3)
|
||||
; CHECK-NEXT: blr
|
||||
%x = load i64, i64* %p, align 8
|
||||
%b = call i64 @llvm.bswap.i64(i64 %x)
|
||||
store i64 %b, i64* %p, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
define i64 @volatile_ld(i64* %p) {
; Volatile i64 load: the combine refuses to split non-simple loads
; (the LD->isSimple() guard in the C++ change), so the default expansion
; remains — a single ld followed by the rotldi/rldimi byte-swap sequence.
|
||||
; CHECK-LABEL: volatile_ld:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: ld 4, 0(3)
|
||||
; CHECK-NEXT: rotldi 5, 4, 16
|
||||
; CHECK-NEXT: rotldi 3, 4, 8
|
||||
; CHECK-NEXT: rldimi 3, 5, 8, 48
|
||||
; CHECK-NEXT: rotldi 5, 4, 24
|
||||
; CHECK-NEXT: rldimi 3, 5, 16, 40
|
||||
; CHECK-NEXT: rotldi 5, 4, 32
|
||||
; CHECK-NEXT: rldimi 3, 5, 24, 32
|
||||
; CHECK-NEXT: rotldi 5, 4, 48
|
||||
; CHECK-NEXT: rldimi 3, 5, 40, 16
|
||||
; CHECK-NEXT: rotldi 5, 4, 56
|
||||
; CHECK-NEXT: rldimi 3, 5, 48, 8
|
||||
; CHECK-NEXT: rldimi 3, 4, 56, 0
|
||||
; CHECK-NEXT: blr
|
||||
%x = load volatile i64, i64* %p, align 8
|
||||
%b = call i64 @llvm.bswap.i64(i64 %x)
|
||||
ret i64 %b
|
||||
}
|
||||
|
||||
define i64 @misaligned_ld(i64* %p) {
; align-1 i64 load: the split does not require natural alignment, so the
; combine still produces the two-lwbrx + rldimi form here.
; NOTE(review): each half load presumably inherits the original alignment —
; confirm against the getLoad calls in the C++ hunk.
|
||||
; CHECK-LABEL: misaligned_ld:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: li 4, 4
|
||||
; CHECK-NEXT: lwbrx 5, 0, 3
|
||||
; CHECK-NEXT: lwbrx 3, 3, 4
|
||||
; CHECK-NEXT: rldimi 3, 5, 32, 0
|
||||
; CHECK-NEXT: blr
|
||||
%x = load i64, i64* %p, align 1
|
||||
%b = call i64 @llvm.bswap.i64(i64 %x)
|
||||
ret i64 %b
|
||||
}
|
||||
|
||||
declare i64 @llvm.bswap.i64(i64) #2
|
Loading…
Reference in New Issue
Block a user