llvm-mirror/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
Commit 2467d01420 by Paul Walker: [SVE] Code generation for fixed length vector adds.
Summary:
Teach LowerToPredicatedOp to lower fixed-length vector operations.

Add AArch64ISD nodes and isel patterns for predicated integer
and floating-point adds.

Together this enables SVE code generation for fixed-length vector adds.

Reviewers: rengolin, efriedma

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D82483
2020-06-26 19:54:41 +00:00
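As a rough sketch of the lowering this enables (the register allocation here is illustrative, not guaranteed by the patch), a fixed-length IR add that is wider than a NEON register, such as

  %res = add <32 x i8> %op1, %op2

can now be selected as a single predicated SVE add of the form

  ptrue p0.b, vl32
  add   z0.b, p0/m, z0.b, z1.b

instead of being legalised into multiple NEON-sized operations. The test file below checks exactly this pattern across element types and vector lengths.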


; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
; VBYTES represents the useful byte size of a vector register from the code
; generator's point of view. It is clamped to power-of-2 values because
; only power-of-2 vector lengths are considered legal, regardless of the
; user-specified vector length.
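; For example, -aarch64-sve-vector-bits-min=384 clamps to a 256-bit legal
; vector length, so those RUN lines pass -D#VBYTES=32, while a minimum of
; 2048 bits gives -D#VBYTES=256. The check lines below then expect each
; ptrue to request the smaller of the element count that fits in one
; register (VBYTES divided by the element size in bytes) and the element
; count of the fixed-length type, e.g. vl[[#min(div(VBYTES,2),16)]] for
; <16 x i16>.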

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: @add_v8i8
; CHECK: add v0.8b, v0.8b, v1.8b
; CHECK: ret
%res = add <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: @add_v16i8
; CHECK: add v0.16b, v0.16b, v1.16b
; CHECK: ret
%res = add <16 x i8> %op1, %op2
ret <16 x i8> %res
}
define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: @add_v32i8
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%res = add <32 x i8> %op1, %op2
store <32 x i8> %res, <32 x i8>* %a
ret void
}
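; Vectors wider than a single SVE register are legalised by splitting them
; into VBYTES-sized parts, with each extra part addressed at a multiple of
; VBYTES from the base pointer. A VBITS_LE_N prefix is enabled on every RUN
; line whose clamped vector length is at most N bits, so the smaller the
; vector length, the more of the splitting checks below apply.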
define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: @add_v64i8
; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
; CHECK: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = add <64 x i8> %op1, %op2
store <64 x i8> %res, <64 x i8>* %a
ret void
}
define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: @add_v128i8
; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b
; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b
; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
; CHECK: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%res = add <128 x i8> %op1, %op2
store <128 x i8> %res, <128 x i8>* %a
ret void
}
define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: @add_v256i8
; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]]
; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]]
; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]]
; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b
; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]]
; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]]
; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]]
; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]]
; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b
; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]]
; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]]
; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]]
; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]]
; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b
; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]]
; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]]
; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[PG]]/m, [[OP1_4]].b, [[OP2_4]].b
; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, x[[OFF_4]]]
; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]]
; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[PG]]/m, [[OP1_5]].b, [[OP2_5]].b
; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]]
; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]]
; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[PG]]/m, [[OP1_6]].b, [[OP2_6]].b
; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]]
; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]]
; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]]
; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]]
; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[PG]]/m, [[OP1_7]].b, [[OP2_7]].b
; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]]
; CHECK: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%res = add <256 x i8> %op1, %op2
store <256 x i8> %res, <256 x i8>* %a
ret void
}
; Don't use SVE for 64-bit vectors.
define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: @add_v4i16
; CHECK: add v0.4h, v0.4h, v1.4h
; CHECK: ret
%res = add <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: @add_v8i16
; CHECK: add v0.8h, v0.8h, v1.8h
; CHECK: ret
%res = add <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: @add_v16i16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%res = add <16 x i16> %op1, %op2
store <16 x i16> %res, <16 x i16>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: @add_v32i16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = add <32 x i16> %op1, %op2
store <32 x i16> %res, <32 x i16>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: @add_v64i16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%res = add <64 x i16> %op1, %op2
store <64 x i16> %res, <64 x i16>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: @add_v128i16
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%res = add <128 x i16> %op1, %op2
store <128 x i16> %res, <128 x i16>* %a
ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: @add_v2i32
; CHECK: add v0.2s, v0.2s, v1.2s
; CHECK: ret
%res = add <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: @add_v4i32
; CHECK: add v0.4s, v0.4s, v1.4s
; CHECK: ret
%res = add <4 x i32> %op1, %op2
ret <4 x i32> %res
}
define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: @add_v8i32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%res = add <8 x i32> %op1, %op2
store <8 x i32> %res, <8 x i32>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: @add_v16i32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%res = add <16 x i32> %op1, %op2
store <16 x i32> %res, <16 x i32>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: @add_v32i32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%res = add <32 x i32> %op1, %op2
store <32 x i32> %res, <32 x i32>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: @add_v64i32
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%res = add <64 x i32> %op1, %op2
store <64 x i32> %res, <64 x i32>* %a
ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: @add_v1i64
; CHECK: add d0, d0, d1
; CHECK: ret
%res = add <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: @add_v2i64
; CHECK: add v0.2d, v0.2d, v1.2d
; CHECK: ret
%res = add <2 x i64> %op1, %op2
ret <2 x i64> %res
}
define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: @add_v4i64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%res = add <4 x i64> %op1, %op2
store <4 x i64> %res, <4 x i64>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: @add_v8i64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%res = add <8 x i64> %op1, %op2
store <8 x i64> %res, <8 x i64>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: @add_v16i64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%res = add <16 x i64> %op1, %op2
store <16 x i64> %res, <16 x i64>* %a
ret void
}
; NOTE: Check lines only cover the first VBYTES because the add_v#i8 tests
; already cover the general legalisation cases.
define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: @add_v32i64
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%res = add <32 x i64> %op1, %op2
store <32 x i64> %res, <32 x i64>* %a
ret void
}
attributes #0 = { "target-features"="+sve" }