1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-20 11:33:24 +02:00
llvm-mirror/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
Roman Lebedev 9e97862e1f [Codegen] TargetLowering::prepareUREMEqFold(): x u% C1 ==/!= C2 (PR35479)
Summary:
The current lowering is:
```
Name: (X % C1) == C2 -> X * C3 <= C4 || false
Pre: (C2 == 0 || C1 u<= C2) && (C1 u>> countTrailingZeros(C1)) * C3 == 1
%zz = and i8 C3, 0 ; trick alive into making C3 avaliable in precondition
%o0 = urem i8 %x, C1
%r = icmp eq i8 %o0, C2
  =>
%zz = and i8 C3, 0 ; and silence it from complaining about said reg
%C4 = -1 /u C1
%n0 = mul i8 %x, C3
%n1 = lshr i8 %n0, countTrailingZeros(C1) ; rotate right
%n2 = shl i8 %n0, ((8-countTrailingZeros(C1)) %u 8) ; rotate right
%n3 = or i8 %n1, %n2 ; rotate right
%is_tautologically_false = icmp ule i8 C1, C2
%C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4
%res = icmp ule i8 %n3, %C4_fixed
%r = xor i1 %res, %is_tautologically_false
```
https://rise4fun.com/Alive/2xC
https://rise4fun.com/Alive/jpb5

However, we can support non-tautological cases `C1 u> C2` too.
Said handling consists of two parts:
* `C2 u<= (-1 %u C1)`. It just works. We only have to change `(X % C1) == C2` into `((X - C2) % C1) == 0`
```
Name: (X % C1) == C2 -> (X - C2) * C3 <= C4   iff C2 u<= (-1 %u C1)
Pre: (C1 u>> countTrailingZeros(C1)) * C3 == 1 && C2 u<= (-1 %u C1)
%zz = and i8 C3, 0 ; trick alive into making C3 avaliable in precondition
%o0 = urem i8 %x, C1
%r = icmp eq i8 %o0, C2
  =>
%zz = and i8 C3, 0 ; and silence it from complaining about said reg
%C4 = (-1 /u C1)
%n0 = sub i8 %x, C2
%n1 = mul i8 %n0, C3
%n2 = lshr i8 %n1, countTrailingZeros(C1) ; rotate right
%n3 = shl i8 %n1, ((8-countTrailingZeros(C1)) %u 8) ; rotate right
%n4 = or i8 %n2, %n3 ; rotate right
%is_tautologically_false = icmp ule i8 C1, C2
%C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4
%res = icmp ule i8 %n4, %C4_fixed
%r = xor i1 %res, %is_tautologically_false
```
https://rise4fun.com/Alive/m4P
https://rise4fun.com/Alive/SKrx
* `C2 u> (-1 %u C1)`. We also have to change `(X % C1) == C2` into `((X - C2) % C1) == 0`,
  and we have to decrement C4:
```
Name: (X % C1) == C2 -> (X - C2) * C3 <= C4   iff C2 u> (-1 %u C1)
Pre: (C1 u>> countTrailingZeros(C1)) * C3 == 1 && C2 u> (-1 %u C1)
%zz = and i8 C3, 0 ; trick alive into making C3 avaliable in precondition
%o0 = urem i8 %x, C1
%r = icmp eq i8 %o0, C2
  =>
%zz = and i8 C3, 0 ; and silence it from complaining about said reg
%C4 = (-1 /u C1)-1
%n0 = sub i8 %x, C2
%n1 = mul i8 %n0, C3
%n2 = lshr i8 %n1, countTrailingZeros(C1) ; rotate right
%n3 = shl i8 %n1, ((8-countTrailingZeros(C1)) %u 8) ; rotate right
%n4 = or i8 %n2, %n3 ; rotate right
%is_tautologically_false = icmp ule i8 C1, C2
%C4_fixed = select i1 %is_tautologically_false, i8 -1, i8 %C4
%res = icmp ule i8 %n4, %C4_fixed
%r = xor i1 %res, %is_tautologically_false
```
https://rise4fun.com/Alive/d40
https://rise4fun.com/Alive/8cF

I believe this concludes `x u% C1 ==/!= C2` lowering.
In fact, clang is may now be better in this regard than gcc:
as it can be seen from `@t32_6_4` test, we do lower `x % 6 == 4`
via this pattern, while gcc does not: https://godbolt.org/z/XNU2z9
And all the general alive proofs say this is legal.
And manual checking agrees: https://rise4fun.com/Alive/WA2

Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=35479 | PR35479 ]].

Reviewers: RKSimon, craig.topper, spatel

Reviewed By: RKSimon

Subscribers: nick, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70053
2019-11-22 15:22:42 +03:00

116 lines
4.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_3:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x9, .LCPI0_1
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
; CHECK-NEXT: movk w8, #43690, lsl #16
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 3, i32 3, i32 3, i32 3>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 2>
ret <4 x i1> %cmp
}
define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_5:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: mov w9, #13106
; CHECK-NEXT: movk w9, #13107, lsl #16
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-NEXT: dup v1.4s, w9
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 5, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 1, i32 2, i32 3, i32 4>
ret <4 x i1> %cmp
}
define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_6_part0:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
; CHECK-NEXT: adrp x9, .LCPI2_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_0]
; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT: ushr v1.4s, v1.4s, #2
; CHECK-NEXT: movi v3.4s, #6
; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 3>
ret <4 x i1> %cmp
}
define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_6_part1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
; CHECK-NEXT: adrp x9, .LCPI3_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_0]
; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT: ushr v1.4s, v1.4s, #2
; CHECK-NEXT: movi v3.4s, #6
; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
%cmp = icmp eq <4 x i32> %urem, <i32 4, i32 5, i32 0, i32 0>
ret <4 x i1> %cmp
}
define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_tautological:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
; CHECK-NEXT: adrp x8, .LCPI4_3
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
; CHECK-NEXT: adrp x8, .LCPI4_4
; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: neg v2.4s, v2.4s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s
; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4]
; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 3>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 2>
ret <4 x i1> %cmp
}