1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-20 11:33:24 +02:00
llvm-mirror/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
Roman Lebedev e0449de48b [Codegen] TargetLowering::prepareUREMEqFold(): x u% C1 ==/!= C2 with tautological C1 u<= C2 (PR35479)
Summary:
This is a preparatory cleanup before i add more
of this fold to deal with comparisons with non-zero.

In essence, the current lowering is:
```
Name: (X % C1) == 0 -> X * C3 <= C4
Pre: (C1 u>> countTrailingZeros(C1)) * C3 == 1
%zz = and i8 C3, 0 ; trick alive into making C3 avaliable in precondition
%o0 = urem i8 %x, C1
%r = icmp eq i8 %o0, 0
  =>
%zz = and i8 C3, 0 ; and silence it from complaining about said reg
%C4 = -1 /u C1
%n0 = mul i8 %x, C3
%n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right
%n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right
%n3 = or i8 %n1, %n2 ; rotate right
%r = icmp ule i8 %n3, %C4
```
https://rise4fun.com/Alive/oqd

It kinda just works, really no weird edge-cases.
But it isn't all that great for when comparing with non-zero.
In particular, given `(X % C1) == C2`, there will be problems
in the always-false tautological case where `C2 u>= C1`:
https://rise4fun.com/Alive/pH3

That case is tautological, always-false:
```
Name: (X % Y) u>= Y
%o0 = urem i8 %x, %y
%r = icmp uge i8 %o0, %y
  =>
%r = false
```
https://rise4fun.com/Alive/ofu

While we can't/shouldn't get such tautological case normally,
we do deal with non-splat vectors, so unless we want to give up
in this case, we need to fixup/short-circuit such lanes.

There are two lowering variants:
1. We can blend between whatever computed result and the correct tautological result
```
Name: (X % C1) == C2 -> X * C3 <= C4 || false
Pre: (C2 == 0 || C1 u<= C2) && (C1 u>> countTrailingZeros(C1)) * C3 == 1
%zz = and i8 C3, 0 ; trick alive into making C3 avaliable in precondition
%o0 = urem i8 %x, C1
%r = icmp eq i8 %o0, C2
  =>
%zz = and i8 C3, 0 ; and silence it from complaining about said reg
%C4 = -1 /u C1
%n0 = mul i8 %x, C3
%n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right
%n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right
%n3 = or i8 %n1, %n2 ; rotate right
%is_tautologically_false = icmp ule i8 C1, C2
%res = icmp ule i8 %n3, %C4
%r = select i1 %is_tautologically_false, i1 0, i1 %res
```
https://rise4fun.com/Alive/PjT5
https://rise4fun.com/Alive/1KV

2. We can invert the comparison result
```
Name: (X % C1) == C2 -> X * C3 <= C4 || false
Pre: (C2 == 0 || C1 u<= C2) && (C1 u>> countTrailingZeros(C1)) * C3 == 1
%zz = and i8 C3, 0 ; trick alive into making C3 avaliable in precondition
%o0 = urem i8 %x, C1
%r = icmp eq i8 %o0, C2
  =>
%zz = and i8 C3, 0 ; and silence it from complaining about said reg
%C4 = -1 /u C1
%n0 = mul i8 %x, C3
%n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right
%n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right
%n3 = or i8 %n1, %n2 ; rotate right
%is_tautologically_false = icmp ule i8 C1, C2
%C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4
%res = icmp ule i8 %n3, %C4_fixed
%r = xor i1 %res, %is_tautologically_false
```
https://rise4fun.com/Alive/2xC
https://rise4fun.com/Alive/jpb5

3. We can expand into `and`/`or`:
https://rise4fun.com/Alive/WGn
https://rise4fun.com/Alive/lcb5

Blend-one is likely better since we avoid having to load the
replacement from constant pool. `xor` is second best since
it's still pretty general. I'm not adding `and`/`or` variants.

Reviewers: RKSimon, craig.topper, spatel

Reviewed By: RKSimon

Subscribers: nick, hiraditya, xbolva00, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70051
2019-11-22 15:16:03 +03:00

98 lines
3.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind {
; CHECK-LABEL: t0_all_tautological:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 2>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 3>
ret <4 x i1> %cmp
}
define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: t1_all_odd_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: movi d1, #0xffff0000ffff0000
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 3, i32 1, i32 1, i32 9>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 42, i32 0, i32 42>
ret <4 x i1> %cmp
}
define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: t1_all_odd_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: movi d1, #0xffff0000ffff0000
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 3, i32 1, i32 1, i32 9>
%cmp = icmp ne <4 x i32> %urem, <i32 0, i32 42, i32 0, i32 42>
ret <4 x i1> %cmp
}
define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind {
; CHECK-LABEL: t2_narrow:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: movi d1, #0xffff0000ffff0000
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%urem = urem <8 x i16> %X, <i16 3, i16 1, i16 1, i16 9, i16 3, i16 1, i16 1, i16 9>
%cmp = icmp eq <8 x i16> %urem, <i16 0, i16 0, i16 42, i16 42, i16 0, i16 0, i16 42, i16 42>
ret <8 x i1> %cmp
}
define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-LABEL: t3_wide:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x9, #-6148914691236517206
; CHECK-NEXT: adrp x11, .LCPI4_0
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: movk x9, #43691
; CHECK-NEXT: fmov x10, d0
; CHECK-NEXT: ldr q0, [x11, :lo12:.LCPI4_0]
; CHECK-NEXT: mul x10, x10, x9
; CHECK-NEXT: mul x8, x8, x9
; CHECK-NEXT: fmov d1, x10
; CHECK-NEXT: mov v1.d[1], x8
; CHECK-NEXT: cmhs v0.2d, v0.2d, v1.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: movi d1, #0xffffffff00000000
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%urem = urem <2 x i64> %X, <i64 3, i64 1>
%cmp = icmp eq <2 x i64> %urem, <i64 0, i64 42>
ret <2 x i1> %cmp
}