[AArch64][CostModel] Fix cost for mul <2 x i64>
This was modeled to have a cost of 1, but since we do not have a MUL.2d this is scalarized into vector inserts/extracts and scalar muls.

The motivating precommitted test is test/Transforms/SLPVectorizer/AArch64/mul.ll, which we don't want to SLP vectorize.

Test Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll unfortunately needed changing, but the reason is documented in LoopVectorize.cpp:6855:

  // The cost of executing VF copies of the scalar instruction. This opcode
  // is unknown. Assume that it is the same as 'mul'.

which I will address next as a follow-up to this.

Differential Revision: https://reviews.llvm.org/D92208
parent e06026157d
commit 19add0d39d
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

@@ -644,8 +644,20 @@ int AArch64TTIImpl::getArithmeticInstrCost(
     }
     return Cost;
 
-  case ISD::ADD:
   case ISD::MUL:
+    if (LT.second != MVT::v2i64)
+      return (Cost + 1) * LT.first;
+    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
+    // as elements are extracted from the vectors and the muls scalarized.
+    // As getScalarizationOverhead is a bit too pessimistic, we estimate the
+    // cost for a i64 vector directly here, which is:
+    // - four i64 extracts,
+    // - two i64 inserts, and
+    // - two muls.
+    // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
+    // LT.first = 2 the cost is 16.
+    return LT.first * 8;
+  case ISD::ADD:
   case ISD::XOR:
   case ISD::OR:
   case ISD::AND:
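The arithmetic behind the new constant is easy to check. What follows is a minimal standalone C++ sketch of the cost computation described in the comment above; it is an illustration rather than the actual LLVM code, and mulV2I64Cost and legalizedCopies are made-up names standing in for the logic around LT.first.

#include <cassert>

// Standalone model of the new cost: each legal <2 x i64> piece is
// scalarized into four lane extracts, two scalar multiplies, and two
// lane inserts, i.e. 8 unit-cost operations per piece.
int mulV2I64Cost(int legalizedCopies /* plays the role of LT.first */) {
  const int extracts = 4; // both lanes of both operands moved out to GPRs
  const int muls = 2;     // one scalar multiply per lane
  const int inserts = 2;  // the two products moved back into a vector
  return legalizedCopies * (extracts + muls + inserts);
}

int main() {
  assert(mulV2I64Cost(1) == 8);  // v2i64: one legal piece
  assert(mulV2I64Cost(2) == 16); // v4i64 is split into two v2i64 pieces
  return 0;
}

This matches the updated THROUGHPUT expectations in the cost-model test below: 8 for t13 (<2 x i64>) and 16 for t14 (<4 x i64>).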
llvm/test/Analysis/CostModel/AArch64/mul.ll

@@ -113,7 +113,7 @@ define <8 x i32> @t12(<8 x i32> %a, <8 x i32> %b) {
 
 define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) {
 ; THROUGHPUT-LABEL: 't13'
-; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = mul nsw <2 x i64> %a, %b
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %1 = mul nsw <2 x i64> %a, %b
 ; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %1
 ;
   %1 = mul nsw <2 x i64> %a, %b
@@ -122,7 +122,7 @@ define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) {
 
 define <4 x i64> @t14(<4 x i64> %a, <4 x i64> %b) {
 ; THROUGHPUT-LABEL: 't14'
-; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = mul nsw <4 x i64> %a, %b
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %1 = mul nsw <4 x i64> %a, %b
 ; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
 ;
   %1 = mul nsw <4 x i64> %a, %b
llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll

@@ -9,8 +9,8 @@
 ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
 
 ; CM: LV: Scalar loop costs: 7.
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
+; CM: LV: Found an estimated cost of 19 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 19 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 
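The new numbers follow from the test's leading comment: a fixed scalarization overhead of 3, plus VF copies of an op charged at the 'mul' cost (the LoopVectorize.cpp behaviour quoted in the commit message). The values are consistent with each copy being charged the new vector mul cost: 3 + 2 x 8 = 19, where it used to be 3 + 2 x 1 = 5. A hypothetical C++ model of that arithmetic, with invented names:

#include <cassert>

// Hypothetical model of the LV numbers above: a fixed scalarization
// overhead of 3 (from the test's leading comment) plus VF copies of an
// op charged at the 'mul' cost.
int extractValueVFCost(int vf, int mulCost) {
  const int scalarizationOverhead = 3;
  return scalarizationOverhead + vf * mulCost;
}

int main() {
  assert(extractValueVFCost(2, 1) == 5);  // before this patch
  assert(extractValueVFCost(2, 8) == 19); // after this patch
  return 0;
}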
llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll

@@ -27,8 +27,7 @@ target triple = "aarch64--linux-gnu"
 ;   str  q0, [x0]
 ;   ret
 ;
-; but if we don't SLP vectorise these examples we get this which is smaller
-; and faster:
+; If we don't SLP vectorise but scalarize this we get this instead:
 ;
 ;   ldp  x8, x9, [x1]
 ;   ldp  x10, x11, [x0]
@@ -37,20 +36,19 @@ target triple = "aarch64--linux-gnu"
 ;   stp  x8, x9, [x0]
 ;   ret
 ;
-; FIXME: don't SLP vectorise this.
-
 define void @mul(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store i64 [[MUL]], i64* [[A]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[MUL4]], i64* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -79,16 +77,18 @@ entry:
 define void @mac(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @mac(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[TMP4]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[A]], align 8
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i64 [[MUL4]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[ADD9]], i64* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
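For reference, the two functions in this test exercise the pattern below. This C++ is a reconstruction from the IR in the CHECK lines, not text taken from the test file:

// What @mul and @mac compute: two independent i64 operations through
// restrict-qualified pointers, the pattern SLP used to turn into a
// single <2 x i64> mul before this patch.
void mul(long *__restrict a, const long *__restrict b) {
  a[0] *= b[0];
  a[1] *= b[1];
}

void mac(long *__restrict a, const long *__restrict b) {
  a[0] = a[0] * b[0] + b[0];
  a[1] = a[1] * b[1] + b[1];
}

With the mul <2 x i64> cost raised to 8, SLP now leaves both functions scalar, which is why the vector CHECK lines above are replaced by scalar ones and the FIXME is dropped.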