1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00
llvm-mirror/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
Simonas Kazlauskas c1d491f5a6 Support {S,U}REMEqFold before legalization
This allows these optimisations to apply to e.g. `urem i16` directly
before `urem` is promoted to i32 on architectures where i16 operations
are not intrinsically legal (such as on AArch64). The legalization then
later can happen more directly and generated code gets a chance to avoid
wasting time on computing results in types wider than necessary, in the end.

Seems like mostly an improvement in terms of results at least as far as x86_64 and aarch64 are concerned, with a few regressions here and there. It also helps in preventing regressions in changes like {D87976}.

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D88785
2021-04-01 01:35:41 +03:00

233 lines
8.3 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
; Odd divisor
; Splat divisor 25: the remainder is tested for equality with zero and the
; <4 x i1> result is widened to <4 x i32>. The expected assembly contains no
; division: it multiplies by 0xC28F5C29 (the inverse of 25 modulo 2^32), adds
; 0x051EB851 and does an unsigned compare against 0x0A3D70A2.
define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_25:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: mov w9, #47185
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: mov w10, #28834
; CHECK-NEXT: movk w10, #2621, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: dup v3.4s, w10
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: cmhs v0.4s, v3.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; Even divisor
; Splat divisor 100 (= 4 * 25): remainder compared with zero, result widened
; to <4 x i32>. The expected assembly multiplies by 0xC28F5C29, adds
; 0x051EB850, rotates each lane right by 2 (shl #30 / ushr #2 / orr) and does
; an unsigned compare against 0x028F5C28.
define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_100:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: mov w9, #47184
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: mov w10, #23592
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: movk w10, #655, lsl #16
; CHECK-NEXT: shl v0.4s, v2.4s, #30
; CHECK-NEXT: ushr v1.4s, v2.4s, #2
; CHECK-NEXT: dup v3.4s, w10
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; Negative divisors should be negated, and thus these are still splat vectors.
; Odd divisor
; Divisor lanes are a mix of 25 and -25. The expected assembly uses a single
; set of splatted constants for all lanes: multiply by 0xC28F5C29, add
; 0x051EB851, unsigned compare against 0x0A3D70A2.
define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_neg25:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: mov w9, #47185
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: mov w10, #28834
; CHECK-NEXT: movk w10, #2621, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: dup v3.4s, w10
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: cmhs v0.4s, v3.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 25, i32 -25, i32 -25, i32 25>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; Even divisor
; Divisor lanes are a mix of 100 and -100. The expected assembly uses a single
; set of splatted constants for all lanes: multiply by 0xC28F5C29, add
; 0x051EB850, rotate right by 2, unsigned compare against 0x028F5C28.
define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_neg100:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: mov w9, #47184
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: mov w10, #23592
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: movk w10, #655, lsl #16
; CHECK-NEXT: shl v0.4s, v2.4s, #30
; CHECK-NEXT: ushr v1.4s, v2.4s, #2
; CHECK-NEXT: dup v3.4s, w10
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
;------------------------------------------------------------------------------;
; Comparison constant has undef elements.
;------------------------------------------------------------------------------;
; Splat divisor 25, but lane 2 of the comparison constant is undef. The
; expected assembly computes the remainder explicitly (smull/smull2 by
; 0x51EB851F, uzp2, sshr #3, usra #31, mls by 25) and then compares it with
; zero using cmeq.
define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_undef1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT: sshr v3.4s, v2.4s, #3
; CHECK-NEXT: movi v1.4s, #25
; CHECK-NEXT: usra v3.4s, v2.4s, #31
; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
%is_zero = icmp eq <4 x i32> %rem, <i32 0, i32 0, i32 undef, i32 0>
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; Splat divisor 100, with lane 2 of the comparison constant undef. The
; expected assembly computes the remainder explicitly (smull/smull2 by
; 0x51EB851F, uzp2, sshr #5, usra #31, mls by 100) and then compares it with
; zero using cmeq.
define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_undef1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT: sshr v3.4s, v2.4s, #5
; CHECK-NEXT: movi v1.4s, #100
; CHECK-NEXT: usra v3.4s, v2.4s, #31
; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
%is_zero = icmp eq <4 x i32> %rem, <i32 0, i32 0, i32 undef, i32 0>
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
;------------------------------------------------------------------------------;
; Negative tests
;------------------------------------------------------------------------------;
; Splat divisor 1 with an `eq 0` comparison: the expected assembly is just a
; constant splat of 1, with no arithmetic on the input.
define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_one_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; Splat divisor 1 with an `ne 0` comparison: the expected assembly is just an
; all-zero vector, with no arithmetic on the input.
define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_one_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%is_nonzero = icmp ne <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_nonzero to <4 x i32>
ret <4 x i32> %res
}
; We can lower remainder of division by powers of two much better elsewhere.
; Splat divisor 16. The expected assembly computes the remainder with a
; sign-adjusted mask sequence (sshr #31, usra #28, bic #15, sub) and then
; compares it with zero using cmeq.
define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_pow2:
; CHECK: // %bb.0:
; CHECK-NEXT: sshr v1.4s, v0.4s, #31
; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: usra v2.4s, v1.4s, #28
; CHECK-NEXT: bic v2.4s, #15
; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; We could lower remainder of division by INT_MIN much better elsewhere.
; Splat divisor with bit pattern 0x80000000 (INT_MIN as i32). The expected
; assembly computes the remainder with a sign-adjusted mask sequence
; (sshr #31, usra #1, and with 0x80000000, sub) and then compares it with
; zero using cmeq.
define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_int_min:
; CHECK: // %bb.0:
; CHECK-NEXT: sshr v1.4s, v0.4s, #31
; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: movi v3.4s, #128, lsl #24
; CHECK-NEXT: usra v2.4s, v1.4s, #1
; CHECK-NEXT: and v1.16b, v2.16b, v3.16b
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}
; We could lower remainder of division by all-ones much better elsewhere.
; Splat divisor with all bits set (-1 as i32). The expected assembly is just
; a constant splat of 1, with no arithmetic on the input.
define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_allones:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ret
%rem = srem <4 x i32> %X, <i32 -1, i32 -1, i32 -1, i32 -1>
%is_zero = icmp eq <4 x i32> %rem, zeroinitializer
%res = zext <4 x i1> %is_zero to <4 x i32>
ret <4 x i32> %res
}