From 6945b6292fcf1e9b25cba44378552b2f88ce39a1 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 22 Jul 2021 17:09:18 +0100 Subject: [PATCH] [SVE][NFC] Cleanup fixed length code gen tests to make them more resilient. Many of the tests have used NEXT when DAG is more approprite. In some cases single DAG lines have been used. Note that these are manual tests because they're to complex for update_llc_test_checks.py and so it's worth not relying too much on the ordered output. I've also made the CHECK lines more uniform when it comes to the ordering of things like LO/HI. --- .../sve-fixed-length-fp-extend-trunc.ll | 131 +++--- .../AArch64/sve-fixed-length-fp-reduce.ll | 26 +- .../AArch64/sve-fixed-length-fp-rounding.ll | 186 ++++----- .../AArch64/sve-fixed-length-fp-to-int.ll | 373 ++++++++--------- .../AArch64/sve-fixed-length-int-extends.ll | 16 +- .../AArch64/sve-fixed-length-int-to-fp.ll | 374 +++++++++--------- .../AArch64/sve-fixed-length-masked-gather.ll | 168 ++++---- .../sve-fixed-length-masked-scatter.ll | 166 ++++---- .../AArch64/sve-fixed-length-trunc-stores.ll | 147 +++---- .../sve-fixed-length-vector-shuffle.ll | 218 +++++----- 10 files changed, 917 insertions(+), 888 deletions(-) diff --git a/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll index 0bd6291eb9c..bb6335d9bf8 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -67,18 +67,18 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16 ; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_256-DAG: st1h { [[VEC:z[0-9]+]].h }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: fcvt [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h -; VBITS_EQ_256-NEXT: fcvt [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: fcvt [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h +; VBITS_EQ_256-DAG: fcvt [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x[[B_HI]]] %op1 = load <16 x half>, <16 x half>* %a %res = fpext <16 x half> %op1 to <16 x float> store <16 x float> %res, <16 x float>* %b @@ -166,18 +166,18 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ldr q[[OP:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ext v[[HI:[0-9]+]].16b, v[[OP]].16b, v[[OP]].16b, #8 -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[OP]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK2_HI]].s -; VBITS_EQ_256-NEXT: fcvt [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[UPK2_LO]].h -; VBITS_EQ_256-NEXT: fcvt [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[UPK2_HI]].h -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[OP]].16b, v[[OP]].16b, #8 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[OP]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: fcvt [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[UPK2_LO]].h +; VBITS_EQ_256-DAG: fcvt [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[UPK2_HI]].h +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[B_HI]]] %op1 = load <8 x half>, <8 x half>* %a %res = fpext <8 x half> %op1 to <8 x double> store <8 x double> %res, <8 x double>* %b @@ -263,18 +263,18 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 ; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0] ; VBITS_EQ_256-DAG: st1w { [[VEC:z[0-9]+]].s }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s -; VBITS_EQ_256-NEXT: fcvt [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s -; VBITS_EQ_256-NEXT: fcvt [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s +; VBITS_EQ_256-DAG: fcvt [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s +; VBITS_EQ_256-DAG: fcvt [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x[[B_HI]]] %op1 = load <8 x float>, <8 x float>* %a %res = fpext <8 x float> %op1 to <8 x double> store <8 x double> %res, <8 x double>* %b @@ -358,18 +358,18 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG1]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG1]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].h, vl8 -; VBITS_EQ_256-NEXT: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: fcvt [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h 
-; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8 +; VBITS_EQ_256-DAG: fcvt [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s +; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16 ; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG4]], [x1] %op1 = load <16 x float>, <16 x float>* %a %res = fptrunc <16 x float> %op1 to <16 x half> @@ -459,18 +459,18 @@ define <8 x half> @fcvt_v8f64_v8f16(<8 x double>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG1]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG1]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: fcvt [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h -; VBITS_EQ_256-NEXT: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h -; VBITS_EQ_256-NEXT: mov v[[RES_LO]].d[1], 
v[[RES_HI]].d[0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: fcvt [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h +; VBITS_EQ_256-DAG: mov v0.d[1], v[[RES_HI]].d[0] %op1 = load <8 x double>, <8 x double>* %a %res = fptrunc <8 x double> %op1 to <8 x half> ret <8 x half> %res @@ -557,19 +557,20 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG1]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG1]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: fcvt [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: fcvt [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG4]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG1]]/z, [x0] +; 
VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: fcvt [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8 +; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1] +; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptrunc <8 x double> %op1 to <8 x float> store <8 x float> %res, <8 x float>* %b diff --git a/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll b/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll index c89540fee79..e1e939346b2 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll @@ -63,10 +63,10 @@ define half @fadda_v32f16(half %start, <32 x half>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 ; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] ; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h ; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h ; VBITS_EQ_256-NEXT: ret @@ -136,10 +136,10 @@ define float @fadda_v16f32(float %start, <16 x float>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] ; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s ; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s ; VBITS_EQ_256-NEXT: ret @@ -209,10 +209,10 @@ define double @fadda_v8f64(double %start, <8 x double>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] ; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d ; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d ; VBITS_EQ_256-NEXT: ret @@ -370,9 +370,9 @@ define float @faddv_v16f32(float %start, <16 x float>* %a) #0 { ; Ensure sensible type legalisation. ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: add x[[A_LO:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_LO]]] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] ; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[LO]].s, [[HI]].s ; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s ; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]] @@ -447,9 +447,9 @@ define double @faddv_v8f64(double %start, <8 x double>* %a) #0 { ; Ensure sensible type legalisation. 
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: add x[[A_LO:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_LO]]] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] ; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[LO]].d, [[HI]].d ; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d ; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]] diff --git a/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll index 1d7472707b0..342ae92063c 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll @@ -45,7 +45,7 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) #0 { define void @frintp_v16f16(<16 x half>* %a) #0 { ; CHECK-LABEL: frintp_v16f16: ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -58,7 +58,7 @@ define void @frintp_v16f16(<16 x half>* %a) #0 { define void @frintp_v32f16(<32 x half>* %a) #0 { ; CHECK-LABEL: frintp_v32f16: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -71,7 +71,7 @@ define void @frintp_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, 
[[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.ceil.v32f16(<32 x half> %op) @@ -82,7 +82,7 @@ define void @frintp_v32f16(<32 x half>* %a) #0 { define void @frintp_v64f16(<64 x half>* %a) #0 { ; CHECK-LABEL: frintp_v64f16: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -95,7 +95,7 @@ define void @frintp_v64f16(<64 x half>* %a) #0 { define void @frintp_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: frintp_v128f16: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -126,7 +126,7 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) #0 { define void @frintp_v8f32(<8 x float>* %a) #0 { ; CHECK-LABEL: frintp_v8f32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -139,7 +139,7 @@ define void @frintp_v8f32(<8 x float>* %a) #0 { define void @frintp_v16f32(<16 x float>* %a) #0 { ; CHECK-LABEL: frintp_v16f32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_512-NEXT: st1w { [[RES]].s 
}, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -152,7 +152,7 @@ define void @frintp_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %op) @@ -163,7 +163,7 @@ define void @frintp_v16f32(<16 x float>* %a) #0 { define void @frintp_v32f32(<32 x float>* %a) #0 { ; CHECK-LABEL: frintp_v32f32: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -176,7 +176,7 @@ define void @frintp_v32f32(<32 x float>* %a) #0 { define void @frintp_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: frintp_v64f32: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -207,7 +207,7 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) #0 { define void @frintp_v4f64(<4 x double>* %a) #0 { ; CHECK-LABEL: frintp_v4f64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -220,7 +220,7 @@ define void @frintp_v4f64(<4 x double>* 
%a) #0 { define void @frintp_v8f64(<8 x double>* %a) #0 { ; CHECK-LABEL: frintp_v8f64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -233,7 +233,7 @@ define void @frintp_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frintp [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frintp [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, <8 x double>* %a %res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %op) @@ -244,7 +244,7 @@ define void @frintp_v8f64(<8 x double>* %a) #0 { define void @frintp_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: frintp_v16f64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -257,7 +257,7 @@ define void @frintp_v16f64(<16 x double>* %a) #0 { define void @frintp_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: frintp_v32f64: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintp [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -292,7 +292,7 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) #0 { define void @frintm_v16f16(<16 x half>* %a) #0 { ; 
CHECK-LABEL: frintm_v16f16: ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -305,7 +305,7 @@ define void @frintm_v16f16(<16 x half>* %a) #0 { define void @frintm_v32f16(<32 x half>* %a) #0 { ; CHECK-LABEL: frintm_v32f16: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -318,7 +318,7 @@ define void @frintm_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.floor.v32f16(<32 x half> %op) @@ -329,7 +329,7 @@ define void @frintm_v32f16(<32 x half>* %a) #0 { define void @frintm_v64f16(<64 x half>* %a) #0 { ; CHECK-LABEL: frintm_v64f16: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -342,7 +342,7 @@ define void @frintm_v64f16(<64 x half>* %a) #0 { define void @frintm_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: frintm_v128f16: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-DAG: ld1h { 
[[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -373,7 +373,7 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) #0 { define void @frintm_v8f32(<8 x float>* %a) #0 { ; CHECK-LABEL: frintm_v8f32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -386,7 +386,7 @@ define void @frintm_v8f32(<8 x float>* %a) #0 { define void @frintm_v16f32(<16 x float>* %a) #0 { ; CHECK-LABEL: frintm_v16f32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -399,7 +399,7 @@ define void @frintm_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %op) @@ -410,7 +410,7 @@ define void @frintm_v16f32(<16 x float>* %a) #0 { define void @frintm_v32f32(<32 x float>* %a) #0 { ; CHECK-LABEL: frintm_v32f32: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; 
VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -423,7 +423,7 @@ define void @frintm_v32f32(<32 x float>* %a) #0 { define void @frintm_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: frintm_v64f32: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -454,7 +454,7 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) #0 { define void @frintm_v4f64(<4 x double>* %a) #0 { ; CHECK-LABEL: frintm_v4f64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -467,7 +467,7 @@ define void @frintm_v4f64(<4 x double>* %a) #0 { define void @frintm_v8f64(<8 x double>* %a) #0 { ; CHECK-LABEL: frintm_v8f64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -480,7 +480,7 @@ define void @frintm_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frintm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frintm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, <8 x double>* %a %res = call <8 x double> 
@llvm.floor.v8f64(<8 x double> %op) @@ -491,7 +491,7 @@ define void @frintm_v8f64(<8 x double>* %a) #0 { define void @frintm_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: frintm_v16f64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -504,7 +504,7 @@ define void @frintm_v16f64(<16 x double>* %a) #0 { define void @frintm_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: frintm_v32f64: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -539,7 +539,7 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) #0 { define void @frinti_v16f16(<16 x half>* %a) #0 { ; CHECK-LABEL: frinti_v16f16: ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -552,7 +552,7 @@ define void @frinti_v16f16(<16 x half>* %a) #0 { define void @frinti_v32f16(<32 x half>* %a) #0 { ; CHECK-LABEL: frinti_v32f16: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -565,7 +565,7 @@ define void @frinti_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].h, 
[[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.nearbyint.v32f16(<32 x half> %op) @@ -576,7 +576,7 @@ define void @frinti_v32f16(<32 x half>* %a) #0 { define void @frinti_v64f16(<64 x half>* %a) #0 { ; CHECK-LABEL: frinti_v64f16: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -589,7 +589,7 @@ define void @frinti_v64f16(<64 x half>* %a) #0 { define void @frinti_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: frinti_v128f16: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -620,7 +620,7 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) #0 { define void @frinti_v8f32(<8 x float>* %a) #0 { ; CHECK-LABEL: frinti_v8f32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -633,7 +633,7 @@ define void @frinti_v8f32(<8 x float>* %a) #0 { define void @frinti_v16f32(<16 x float>* %a) #0 { ; CHECK-LABEL: frinti_v16f32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-DAG: ld1w { 
[[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -646,7 +646,7 @@ define void @frinti_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %op) @@ -657,7 +657,7 @@ define void @frinti_v16f32(<16 x float>* %a) #0 { define void @frinti_v32f32(<32 x float>* %a) #0 { ; CHECK-LABEL: frinti_v32f32: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -670,7 +670,7 @@ define void @frinti_v32f32(<32 x float>* %a) #0 { define void @frinti_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: frinti_v64f32: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -701,7 +701,7 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) #0 { define void @frinti_v4f64(<4 x double>* %a) #0 { ; CHECK-LABEL: frinti_v4f64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1d { 
[[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -714,7 +714,7 @@ define void @frinti_v4f64(<4 x double>* %a) #0 { define void @frinti_v8f64(<8 x double>* %a) #0 { ; CHECK-LABEL: frinti_v8f64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -727,7 +727,7 @@ define void @frinti_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frinti [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frinti [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, <8 x double>* %a %res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %op) @@ -738,7 +738,7 @@ define void @frinti_v8f64(<8 x double>* %a) #0 { define void @frinti_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: frinti_v16f64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frinti [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -751,7 +751,7 @@ define void @frinti_v16f64(<16 x double>* %a) #0 { define void @frinti_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: frinti_v32f64: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frinti [[RES:z[0-9]+]].d, 
[[PG]]/m, [[OP]].d ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -786,7 +786,7 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) #0 { define void @frintx_v16f16(<16 x half>* %a) #0 { ; CHECK-LABEL: frintx_v16f16: ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -799,7 +799,7 @@ define void @frintx_v16f16(<16 x half>* %a) #0 { define void @frintx_v32f16(<32 x half>* %a) #0 { ; CHECK-LABEL: frintx_v32f16: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -812,7 +812,7 @@ define void @frintx_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.rint.v32f16(<32 x half> %op) @@ -823,7 +823,7 @@ define void @frintx_v32f16(<32 x half>* %a) #0 { define void @frintx_v64f16(<64 x half>* %a) #0 { ; CHECK-LABEL: frintx_v64f16: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -836,7 +836,7 
@@ define void @frintx_v64f16(<64 x half>* %a) #0 { define void @frintx_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: frintx_v128f16: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -867,7 +867,7 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) #0 { define void @frintx_v8f32(<8 x float>* %a) #0 { ; CHECK-LABEL: frintx_v8f32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -880,7 +880,7 @@ define void @frintx_v8f32(<8 x float>* %a) #0 { define void @frintx_v16f32(<16 x float>* %a) #0 { ; CHECK-LABEL: frintx_v16f32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -893,7 +893,7 @@ define void @frintx_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.rint.v16f32(<16 x float> %op) @@ -904,7 +904,7 @@ define void @frintx_v16f32(<16 x float>* %a) #0 { define void @frintx_v32f32(<32 x float>* %a) #0 { ; 
CHECK-LABEL: frintx_v32f32: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -917,7 +917,7 @@ define void @frintx_v32f32(<32 x float>* %a) #0 { define void @frintx_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: frintx_v64f32: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -948,7 +948,7 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) #0 { define void @frintx_v4f64(<4 x double>* %a) #0 { ; CHECK-LABEL: frintx_v4f64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -961,7 +961,7 @@ define void @frintx_v4f64(<4 x double>* %a) #0 { define void @frintx_v8f64(<8 x double>* %a) #0 { ; CHECK-LABEL: frintx_v8f64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -974,7 +974,7 @@ define void @frintx_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frintx [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frintx [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; 
VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, <8 x double>* %a %res = call <8 x double> @llvm.rint.v8f64(<8 x double> %op) @@ -985,7 +985,7 @@ define void @frintx_v8f64(<8 x double>* %a) #0 { define void @frintx_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: frintx_v16f64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -998,7 +998,7 @@ define void @frintx_v16f64(<16 x double>* %a) #0 { define void @frintx_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: frintx_v32f64: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintx [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -1033,7 +1033,7 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) #0 { define void @frinta_v16f16(<16 x half>* %a) #0 { ; CHECK-LABEL: frinta_v16f16: ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -1046,7 +1046,7 @@ define void @frinta_v16f16(<16 x half>* %a) #0 { define void @frinta_v32f16(<32 x half>* %a) #0 { ; CHECK-LABEL: frinta_v32f16: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].h, 
[[PG]]/m, [[OP]].h ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -1059,7 +1059,7 @@ define void @frinta_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.round.v32f16(<32 x half> %op) @@ -1070,7 +1070,7 @@ define void @frinta_v32f16(<32 x half>* %a) #0 { define void @frinta_v64f16(<64 x half>* %a) #0 { ; CHECK-LABEL: frinta_v64f16: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -1083,7 +1083,7 @@ define void @frinta_v64f16(<64 x half>* %a) #0 { define void @frinta_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: frinta_v128f16: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -1114,7 +1114,7 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) #0 { define void @frinta_v8f32(<8 x float>* %a) #0 { ; CHECK-LABEL: frinta_v8f32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret @@ 
-1127,7 +1127,7 @@ define void @frinta_v8f32(<8 x float>* %a) #0 { define void @frinta_v16f32(<16 x float>* %a) #0 { ; CHECK-LABEL: frinta_v16f32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -1140,7 +1140,7 @@ define void @frinta_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.round.v16f32(<16 x float> %op) @@ -1151,7 +1151,7 @@ define void @frinta_v16f32(<16 x float>* %a) #0 { define void @frinta_v32f32(<32 x float>* %a) #0 { ; CHECK-LABEL: frinta_v32f32: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -1164,7 +1164,7 @@ define void @frinta_v32f32(<32 x float>* %a) #0 { define void @frinta_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: frinta_v64f32: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -1195,7 +1195,7 @@ define <2 x double> @frinta_v2f64(<2 
x double> %op) #0 { define void @frinta_v4f64(<4 x double>* %a) #0 { ; CHECK-LABEL: frinta_v4f64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -1208,7 +1208,7 @@ define void @frinta_v4f64(<4 x double>* %a) #0 { define void @frinta_v8f64(<8 x double>* %a) #0 { ; CHECK-LABEL: frinta_v8f64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -1221,7 +1221,7 @@ define void @frinta_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frinta [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frinta [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, <8 x double>* %a %res = call <8 x double> @llvm.round.v8f64(<8 x double> %op) @@ -1232,7 +1232,7 @@ define void @frinta_v8f64(<8 x double>* %a) #0 { define void @frinta_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: frinta_v16f64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -1245,7 +1245,7 @@ define void @frinta_v16f64(<16 x double>* %a) #0 { define void @frinta_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: frinta_v32f64: ; 
VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frinta [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -1306,7 +1306,7 @@ define void @frintn_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.roundeven.v32f16(<32 x half> %op) @@ -1387,7 +1387,7 @@ define void @frintn_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %op) @@ -1468,7 +1468,7 @@ define void @frintn_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frintn [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frintn [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, <8 x double>* %a %res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %op) @@ -1527,7 +1527,7 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) #0 { define void 
@frintz_v16f16(<16 x half>* %a) #0 { ; CHECK-LABEL: frintz_v16f16: ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -1540,7 +1540,7 @@ define void @frintz_v16f16(<16 x half>* %a) #0 { define void @frintz_v32f16(<32 x half>* %a) #0 { ; CHECK-LABEL: frintz_v32f16: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -1553,7 +1553,7 @@ define void @frintz_v32f16(<32 x half>* %a) #0 { ; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h ; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h ; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <32 x half>, <32 x half>* %a %res = call <32 x half> @llvm.trunc.v32f16(<32 x half> %op) @@ -1564,7 +1564,7 @@ define void @frintz_v32f16(<32 x half>* %a) #0 { define void @frintz_v64f16(<64 x half>* %a) #0 { ; CHECK-LABEL: frintz_v64f16: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -1577,7 +1577,7 @@ define void @frintz_v64f16(<64 x half>* %a) #0 { define void @frintz_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: frintz_v128f16: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, 
vl128 -; VBITS_GE_2048-DAG: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -1608,7 +1608,7 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) #0 { define void @frintz_v8f32(<8 x float>* %a) #0 { ; CHECK-LABEL: frintz_v8f32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -1621,7 +1621,7 @@ define void @frintz_v8f32(<8 x float>* %a) #0 { define void @frintz_v16f32(<16 x float>* %a) #0 { ; CHECK-LABEL: frintz_v16f32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -1634,7 +1634,7 @@ define void @frintz_v16f32(<16 x float>* %a) #0 { ; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s ; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <16 x float>, <16 x float>* %a %res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %op) @@ -1645,7 +1645,7 @@ define void @frintz_v16f32(<16 x float>* %a) #0 { define void @frintz_v32f32(<32 x float>* %a) #0 { ; CHECK-LABEL: frintz_v32f32: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { 
[[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -1658,7 +1658,7 @@ define void @frintz_v32f32(<32 x float>* %a) #0 { define void @frintz_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: frintz_v64f32: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-DAG: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -1689,7 +1689,7 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) #0 { define void @frintz_v4f64(<4 x double>* %a) #0 { ; CHECK-LABEL: frintz_v4f64: ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: ret @@ -1702,7 +1702,7 @@ define void @frintz_v4f64(<4 x double>* %a) #0 { define void @frintz_v8f64(<8 x double>* %a) #0 { ; CHECK-LABEL: frintz_v8f64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret @@ -1715,7 +1715,7 @@ define void @frintz_v8f64(<8 x double>* %a) #0 { ; VBITS_EQ_256-DAG: frintz [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_EQ_256-DAG: frintz [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op = load <8 x double>, 
<8 x double>* %a %res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %op) @@ -1726,7 +1726,7 @@ define void @frintz_v8f64(<8 x double>* %a) #0 { define void @frintz_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: frintz_v16f64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -1739,7 +1739,7 @@ define void @frintz_v16f64(<16 x double>* %a) #0 { define void @frintz_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: frintz_v32f64: ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-DAG: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048-NEXT: frintz [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret diff --git a/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index 16880c7c493..e35e71bb9e9 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -68,15 +68,14 @@ define void @fcvtzu_v32f16_v32i16(<32 x half>* %a, <32 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: fcvtzu [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h -; VBITS_EQ_256-NEXT: fcvtzu [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h -; VBITS_EQ_256-NEXT: st1h { [[RES_LO]].h }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1h { [[RES_HI]].h }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h +; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h +; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]]] ; VBITS_EQ_256-NEXT: ret %op1 = load <32 x half>, <32 x half>* %a %res = fptoui <32 x half> %op1 to <32 x i16> @@ -160,16 +159,17 @@ define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 { ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 ; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1h { [[VEC:z[0-9]+]].h }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h -; VBITS_EQ_256-NEXT: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h +; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x8] %op1 = load <16 x half>, <16 x half>* %a %res = fptoui <16 x half> %op1 to <16 x i32> store <16 x i32> %res, <16 x i32>* %b @@ -258,18 +258,18 @@ define void @fcvtzu_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ldr q[[OP:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s -; VBITS_EQ_256-NEXT: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h -; VBITS_EQ_256-NEXT: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] +; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h +; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x half>, <8 x half>* %a %res = fptoui <8 x half> %op1 to <8 x i64> @@ -362,18 +362,18 @@ define void @fcvtzu_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].h, vl8 -; VBITS_EQ_256-NEXT: fcvtzu [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: fcvtzu [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8 +; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s +; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16 ; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a @@ -458,15 +458,15 @@ define void @fcvtzu_v16f32_v16i32(<16 x float>* %a, <16 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s +; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %res = fptoui <16 x float> %op1 to <16 x i32> @@ -549,18 +549,19 @@ define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 ; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1w { [[VEC:z[0-9]+]].s }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s -; VBITS_EQ_256-NEXT: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s -; VBITS_EQ_256-NEXT: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s +; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s +; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] %op1 = load <8 x float>, <8 x float>* %a %res = fptoui <8 x float> %op1 to <8 x i64> store <8 x i64> %res, <8 x i64>* %b @@ -649,18 +650,18 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(<8 x double>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h -; VBITS_EQ_256-NEXT: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h -; VBITS_EQ_256-NEXT: mov v[[RES_LO]].d[1], v[[RES_HI]].d[0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h +; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptoui <8 x double> %op1 to <8 x i16> @@ -750,18 +751,18 @@ define void @fcvtzu_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: fcvtzu [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a @@ -847,15 +848,15 @@ define void @fcvtzu_v8f64_v8i64(<8 x double>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: fcvtzu [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d +; VBITS_EQ_256-DAG: fcvtzu [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptoui <8 x double> %op1 to <8 x i64> @@ -937,15 +938,15 @@ define void @fcvtzs_v32f16_v32i16(<32 x half>* %a, <32 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: fcvtzs [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h -; VBITS_EQ_256-NEXT: fcvtzs [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h -; VBITS_EQ_256-NEXT: st1h { [[RES_LO]].h }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1h { [[RES_HI]].h }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h +; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h +; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <32 x half>, <32 x half>* %a %res = fptosi <32 x half> %op1 to <32 x i16> @@ -1027,18 +1028,19 @@ define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16 ; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1h { [[VEC:z[0-9]+]].h }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h -; VBITS_EQ_256-NEXT: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h +; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x8] %op1 = load <16 x half>, <16 x half>* %a %res = fptosi <16 x half> %op1 to <16 x i32> store <16 x i32> %res, <16 x i32>* %b @@ -1127,18 +1129,18 @@ define void @fcvtzs_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ldr q[[OP:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s -; VBITS_EQ_256-NEXT: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h -; VBITS_EQ_256-NEXT: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] +; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].h +; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].h +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x half>, <8 x half>* %a %res = fptosi <8 x half> %op1 to <8 x i64> @@ -1231,18 +1233,18 @@ define void @fcvtzs_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].h, vl8 -; VBITS_EQ_256-NEXT: fcvtzs [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: fcvtzs [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8 +; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].s +; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16 ; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a @@ -1327,15 +1329,15 @@ define void @fcvtzs_v16f32_v16i32(<16 x float>* %a, <16 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s +; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %res = fptosi <16 x float> %op1 to <16 x i32> @@ -1418,18 +1420,19 @@ define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 ; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1w { [[VEC:z[0-9]+]].s }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s -; VBITS_EQ_256-NEXT: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s -; VBITS_EQ_256-NEXT: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s +; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s +; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] %op1 = load <8 x float>, <8 x float>* %a %res = fptosi <8 x float> %op1 to <8 x i64> store <8 x i64> %res, <8 x i64>* %b @@ -1518,18 +1521,18 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(<8 x double>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h -; VBITS_EQ_256-NEXT: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h -; VBITS_EQ_256-NEXT: mov v[[RES_LO]].d[1], v[[RES_HI]].d[0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h +; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptosi <8 x double> %op1 to <8 x i16> @@ -1619,18 +1622,18 @@ define void @fcvtzs_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: fcvtzs [[CVT_HI:z[0-9]+]].d, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a @@ -1716,15 +1719,15 @@ define void @fcvtzs_v8f64_v8i64(<8 x double>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: fcvtzs [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d +; VBITS_EQ_256-DAG: fcvtzs [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %res = fptosi <8 x double> %op1 to <8 x i64> diff --git a/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 3e009391c3a..28856aadafe 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -139,13 +139,13 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 { define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 { ; CHECK-LABEL: sext_v16i8_v16i32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b +; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b ; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-DAG: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8 ; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b ; VBITS_EQ_256-DAG: sunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b ; VBITS_EQ_256-DAG: sunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h @@ -213,7 +213,7 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 { define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 { ; CHECK-LABEL: sext_v8i8_v8i64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b +; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b ; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0] @@ -226,7 +226,7 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 { define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 { ; CHECK-LABEL: sext_v16i8_v16i64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b +; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, z0.b ; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0] @@ -501,13 +501,13 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, <8 x i32>* %out) #0 { define void @zext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 { ; CHECK-LABEL: zext_v16i8_v16i32: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b +; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b ; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8 +; VBITS_EQ_256-DAG: ext v[[A_HI:[0-9]+]].16b, v0.16b, v0.16b, #8 ; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_LO:z[0-9]+]].h, z0.b ; VBITS_EQ_256-DAG: uunpklo [[A_HALFS_HI:z[0-9]+]].h, z[[A_HI]].b ; VBITS_EQ_256-DAG: uunpklo [[A_WORDS_LO:z[0-9]+]].s, [[A_HALFS_LO]].h @@ -575,7 +575,7 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, <4 x i64>* %out) #0 { define void @zext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 { ; CHECK-LABEL: zext_v8i8_v8i64: ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b +; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b ; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0] @@ -588,7 +588,7 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, <8 x i64>* %out) #0 { define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 { ; CHECK-LABEL: zext_v16i8_v16i64: ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b +; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, z0.b ; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x0] diff --git a/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 0a613156f7d..d266d9be28e 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -68,15 +68,15 @@ define void @ucvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ucvtf [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h -; VBITS_EQ_256-NEXT: ucvtf [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h -; VBITS_EQ_256-NEXT: st1h { [[RES_LO]].h }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1h { [[RES_HI]].h }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ucvtf [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h +; VBITS_EQ_256-DAG: ucvtf [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h +; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %res = uitofp <32 x i16> %op1 to <32 x half> @@ -159,18 +159,19 @@ define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16 ; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1h { [[VEC:z[0-9]+]].h }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: ucvtf [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].s -; VBITS_EQ_256-NEXT: ucvtf [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: ucvtf [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].s +; VBITS_EQ_256-DAG: ucvtf [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x8] %op1 = load <16 x i16>, <16 x i16>* %a %res = uitofp <16 x i16> %op1 to <16 x float> store <16 x float> %res, <16 x float>* %b @@ -261,18 +262,18 @@ define void @ucvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ldr q[[OP:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s -; VBITS_EQ_256-NEXT: ucvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].d -; VBITS_EQ_256-NEXT: ucvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] +; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: ucvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].d +; VBITS_EQ_256-DAG: ucvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a %res = uitofp <8 x i16> %op1 to <8 x double> @@ -361,18 +362,18 @@ define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].h, vl8 -; VBITS_EQ_256-NEXT: ucvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: ucvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8 +; VBITS_EQ_256-DAG: ucvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s +; VBITS_EQ_256-DAG: ucvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16 ; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -457,15 +458,15 @@ define void @ucvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ucvtf [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: ucvtf [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ucvtf [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s +; VBITS_EQ_256-DAG: ucvtf [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %res = uitofp <16 x i32> %op1 to <16 x float> @@ -548,18 +549,19 @@ define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 ; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1w { [[VEC:z[0-9]+]].s }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s -; VBITS_EQ_256-NEXT: ucvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].d -; VBITS_EQ_256-NEXT: ucvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s +; VBITS_EQ_256-DAG: ucvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].d +; VBITS_EQ_256-DAG: ucvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] %op1 = load <8 x i32>, <8 x i32>* %a %res = uitofp <8 x i32> %op1 to <8 x double> store <8 x double> %res, <8 x double>* %b @@ -648,18 +650,18 @@ define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: ucvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: ucvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h -; VBITS_EQ_256-NEXT: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h -; VBITS_EQ_256-NEXT: mov v[[RES_LO]].d[1], v[[RES_HI]].d[0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: ucvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: ucvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h +; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %res = uitofp <8 x i64> %op1 to <8 x half> @@ -749,18 +751,18 @@ define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: ucvtf [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: ucvtf [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: ucvtf [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: ucvtf [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a @@ -846,15 +848,15 @@ define void @ucvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ucvtf [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: ucvtf [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ucvtf [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d +; VBITS_EQ_256-DAG: ucvtf [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %res = uitofp <8 x i64> %op1 to <8 x double> @@ -936,15 +938,15 @@ define void @scvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: scvtf [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h -; VBITS_EQ_256-NEXT: scvtf [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h -; VBITS_EQ_256-NEXT: st1h { [[RES_LO]].h }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1h { [[RES_HI]].h }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: scvtf [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[LO]].h +; VBITS_EQ_256-DAG: scvtf [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[HI]].h +; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %res = sitofp <32 x i16> %op1 to <32 x half> @@ -1027,18 +1029,19 @@ define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16 ; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1h { [[VEC:z[0-9]+]].h }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: sunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: sunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: scvtf [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].s -; VBITS_EQ_256-NEXT: scvtf [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: sunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: sunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: scvtf [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].s +; VBITS_EQ_256-DAG: scvtf [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x8] %op1 = load <16 x i16>, <16 x i16>* %a %res = sitofp <16 x i16> %op1 to <16 x float> store <16 x float> %res, <16 x float>* %b @@ -1129,18 +1132,18 @@ define void @scvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ldr q[[OP:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 -; VBITS_EQ_256-NEXT: sunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h -; VBITS_EQ_256-NEXT: sunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h -; VBITS_EQ_256-NEXT: sunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-NEXT: sunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s -; VBITS_EQ_256-NEXT: scvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].d -; VBITS_EQ_256-NEXT: scvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] +; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[LO:[0-9]+]].16b, v[[OP]].16b, #8 +; VBITS_EQ_256-DAG: sunpklo [[UPK1_LO:z[0-9]+]].s, z[[LO]].h +; VBITS_EQ_256-DAG: sunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h +; VBITS_EQ_256-DAG: sunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: sunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: scvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK2_LO]].d +; VBITS_EQ_256-DAG: scvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK2_HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i16>, <8 x i16>* %a %res = sitofp <8 x i16> %op1 to <8 x double> @@ -1229,18 +1232,18 @@ define void @scvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].h, vl8 -; VBITS_EQ_256-NEXT: scvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: scvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8 +; VBITS_EQ_256-DAG: scvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s +; VBITS_EQ_256-DAG: scvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16 ; VBITS_EQ_256-NEXT: st1h { [[RES]].h }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -1325,15 +1328,15 @@ define void @scvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: scvtf [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s -; VBITS_EQ_256-NEXT: scvtf [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES_LO]].s }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1w { [[RES_HI]].s }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: scvtf [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[LO]].s +; VBITS_EQ_256-DAG: scvtf [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %res = sitofp <16 x i32> %op1 to <16 x float> @@ -1416,18 +1419,19 @@ define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently. 
-; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 ; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: mov x8, sp ; VBITS_EQ_256-DAG: st1w { [[VEC:z[0-9]+]].s }, [[PG1]], [x8] ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: sunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s -; VBITS_EQ_256-NEXT: sunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s -; VBITS_EQ_256-NEXT: scvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].d -; VBITS_EQ_256-NEXT: scvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG2]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: sunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s +; VBITS_EQ_256-DAG: sunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s +; VBITS_EQ_256-DAG: scvtf [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].d +; VBITS_EQ_256-DAG: scvtf [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x8] %op1 = load <8 x i32>, <8 x i32>* %a %res = sitofp <8 x i32> %op1 to <8 x double> store <8 x double> %res, <8 x double>* %b @@ -1516,17 +1520,17 @@ define <8 x half> @scvtf_v8i64_v8f16(<8 x i64>* %a) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: scvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: scvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h -; VBITS_EQ_256-NEXT: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: scvtf [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: scvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h ; VBITS_EQ_256-NEXT: mov v[[RES_LO]].d[1], v[[RES_HI]].d[0] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a @@ -1617,18 +1621,18 @@ define void @scvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].d -; VBITS_EQ_256-NEXT: ptrue [[PG3:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: scvtf [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: scvtf [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s -; VBITS_EQ_256-NEXT: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: ptrue [[PG4:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: scvtf [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d +; VBITS_EQ_256-DAG: scvtf [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d +; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a @@ -1714,15 +1718,15 @@ define void @scvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: scvtf [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d -; VBITS_EQ_256-NEXT: scvtf [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG]], [x1] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: scvtf [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[LO]].d +; VBITS_EQ_256-DAG: scvtf [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x8] ; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %res = sitofp <8 x i64> %op1 to <8 x double> diff --git a/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 5ee33342460..c3903e0253e 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -70,31 +70,6 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 { define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8i8: -; VBITS_EQ_256: ldr d[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-NEXT: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] -; VBITS_EQ_256-NEXT: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b -; 
VBITS_EQ_256-NEXT: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b -; VBITS_EQ_256-NEXT: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8 -; VBITS_EQ_256-NEXT: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 -; VBITS_EQ_256-NEXT: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8 -; VBITS_EQ_256-NEXT: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8 -; VBITS_EQ_256-NEXT: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0 -; VBITS_EQ_256-NEXT: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0 -; VBITS_EQ_256-NEXT: ld1sb { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: ld1sb { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-NEXT: uzp1 [[UZP1_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: uzp1 [[UZP1_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s -; VBITS_EQ_256-NEXT: uzp1 z[[UZP2_HI:[0-9]+]].h, [[UZP1_HI]].h, [[UZP1_HI]].h -; VBITS_EQ_256-NEXT: uzp1 z[[UZP2_LO:[0-9]+]].h, [[UZP1_LO]].h, [[UZP1_LO]].h -; VBITS_EQ_256-NEXT: uzp1 v[[UZP3:[0-9]+]].8b, v[[UZP2_LO]].8b, v[[UZP2_HI]].8b -; VBITS_EQ_256-NEXT: str d[[UZP3]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ldr d[[VALS:[0-9]+]], [x0] ; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] @@ -107,6 +82,32 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; VBITS_GE_512-NEXT: uzp1 z[[UZP3:[0-9]+]].b, [[UZP2]].b, [[UZP2]].b ; VBITS_GE_512-NEXT: str d[[UZP3]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ldr d[[VALS:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] +; VBITS_EQ_256-DAG: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0 +; VBITS_EQ_256-DAG: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8 +; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8 +; VBITS_EQ_256-DAG: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8 +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0 +; VBITS_EQ_256-DAG: ld1sb { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: ld1sb { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-DAG: uzp1 [[UZP1_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[UZP1_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: uzp1 z[[UZP2_LO:[0-9]+]].h, [[UZP1_LO]].h, [[UZP1_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[UZP2_HI:[0-9]+]].h, [[UZP1_HI]].h, [[UZP1_HI]].h +; VBITS_EQ_256-NEXT: uzp1 v[[UZP3:[0-9]+]].8b, v[[UZP2_LO]].8b, v[[UZP2_HI]].8b +; VBITS_EQ_256-NEXT: str d[[UZP3]], [x0] +; VBITS_EQ_256-NEXT: ret %cval = load <8 x i8>, <8 x i8>* %a %ptrs = load <8 x i8*>, <8 x i8*>* %b %mask = icmp eq <8 x i8> %cval, zeroinitializer @@ -207,26 +208,6 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8i16: -; VBITS_EQ_256: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: add x8, x1, #32 
-; VBITS_EQ_256-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 -; VBITS_EQ_256-NEXT: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0 -; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0 -; VBITS_EQ_256-DAG: ld1h { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-DAG: ld1h { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: uzp1 [[UZP1_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s -; VBITS_EQ_256-NEXT: uzp1 z[[UZP2_LO:[0-9]+]].h, [[UZP1_LO]].h, [[UZP1_LO]].h -; VBITS_EQ_256-NEXT: uzp1 [[UZP1_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: uzp1 z[[UZP2_HI:[0-9]+]].h, [[UZP1_HI]].h, [[UZP1_HI]].h -; VBITS_EQ_256-NEXT: mov v[[UZP2_LO]].d[1], v[[UZP2_HI]].d[0] -; VBITS_EQ_256-NEXT: str q[[UZP2_LO]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0] ; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] @@ -238,6 +219,27 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; VBITS_GE_512-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h ; VBITS_GE_512-NEXT: str q[[UZP2]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ldr q[[VALS:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0 +; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0 +; VBITS_EQ_256-DAG: ld1h { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: ld1h { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-DAG: uzp1 [[UZP1_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s +; VBITS_EQ_256-DAG: uzp1 z[[UZP2_LO:[0-9]+]].h, [[UZP1_LO]].h, [[UZP1_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[UZP1_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: uzp1 z[[UZP2_HI:[0-9]+]].h, [[UZP1_HI]].h, [[UZP1_HI]].h +; VBITS_EQ_256-NEXT: mov v[[UZP2_LO]].d[1], v[[UZP2_HI]].d[0] +; VBITS_EQ_256-NEXT: str q[[UZP2_LO]], [x0] +; VBITS_EQ_256-NEXT: ret %cval = load <8 x i16>, <8 x i16>* %a %ptrs = load <8 x i16*>, <8 x i16*>* %b %mask = icmp eq <8 x i16> %cval, zeroinitializer @@ -331,28 +333,6 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8i32: -; VBITS_EQ_256: ptrue [[PG0:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG1]]/z, [x8] -; VBITS_EQ_256-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 -; VBITS_EQ_256-NEXT: 
mov x8, sp -; VBITS_EQ_256-NEXT: mov [[MONE:z[0-9]+]].s, p1/z, #-1 -; VBITS_EQ_256-NEXT: st1w { [[MONE]].s }, [[PG0]], [x8] -; VBITS_EQ_256-NEXT: ldr q[[CMP_HI:[0-9]+]], [sp, #16] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0 -; VBITS_EQ_256-NEXT: ld1w { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: ldr q[[CMP_LO:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: uzp1 [[UZP_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0 -; VBITS_EQ_256-NEXT: ld1w { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-NEXT: uzp1 [[UZP_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s -; VBITS_EQ_256-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], [[RES_LO]].s, [[RES_HI]].s -; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG0]], [x0] - ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8 ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 @@ -362,6 +342,29 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG1]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG1]]/z, [x8] +; VBITS_EQ_256-DAG: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_EQ_256-DAG: mov x8, sp +; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, p1/z, #-1 +; VBITS_EQ_256-DAG: st1w { [[MONE]].s }, [[PG0]], [x8] +; VBITS_EQ_256-DAG: ldr q[[CMP_HI:[0-9]+]], [sp, #16] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0 +; VBITS_EQ_256-DAG: ld1w { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-DAG: ldr q[[CMP_LO:[0-9]+]], [sp] +; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0 +; VBITS_EQ_256-DAG: ld1w { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s +; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG1]], [[RES_LO]].s, [[RES_HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG0]], [x0] %cval = load <8 x i32>, <8 x i32>* %a %ptrs = load <8 x i32*>, <8 x i32*>* %b %mask = icmp eq <8 x i32> %cval, zeroinitializer @@ -461,21 +464,6 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 { define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8i64: -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1d { [[VALS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[VALS_LO:z[0-9]+]].d }, [[PG0]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; 
VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x9] -; VBITS_EQ_256-NEXT: cmpeq [[MASK_HI:p[0-9]+]].d, [[PG0]]/z, [[VALS_HI]].d, #0 -; VBITS_EQ_256-NEXT: cmpeq [[MASK_LO:p[0-9]+]].d, [[PG0]]/z, [[VALS_LO]].d, #0 -; VBITS_EQ_256-NEXT: ld1d { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: ld1d { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-NEXT: st1d { [[RES_HI]].d }, [[PG0]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[RES_LO]].d }, [[PG0]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] @@ -483,6 +471,22 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; VBITS_GE_512-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[VALS_LO:z[0-9]+]].d }, [[PG0]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[VALS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x9] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_EQ_256-DAG: cmpeq [[MASK_LO:p[0-9]+]].d, [[PG0]]/z, [[VALS_LO]].d, #0 +; VBITS_EQ_256-DAG: cmpeq [[MASK_HI:p[0-9]+]].d, [[PG0]]/z, [[VALS_HI]].d, #0 +; VBITS_EQ_256-DAG: ld1d { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: ld1d { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG0]], [x0] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG0]], [x8] +; VBITS_EQ_256-NEXT: ret %cval = load <8 x i64>, <8 x i64>* %a %ptrs = load <8 x i64*>, <8 x i64*>* %b %mask = icmp eq <8 x i64> %cval, zeroinitializer diff --git a/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index 5dc40e399d0..e8fa2e690bc 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -66,31 +66,6 @@ define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 { define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8i8: -; VBITS_EQ_256: ldr d[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 -; VBITS_EQ_256-NEXT: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0 -; VBITS_EQ_256-NEXT: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b -; VBITS_EQ_256-NEXT: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b -; VBITS_EQ_256-NEXT: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8 -; VBITS_EQ_256-NEXT: shl [[SHL_HI:v[0-9]+]].4h, 
[[VAL_HI]].4h, #8 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-NEXT: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8 -; VBITS_EQ_256-NEXT: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8 -; VBITS_EQ_256-NEXT: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0 -; VBITS_EQ_256-NEXT: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0 -; VBITS_EQ_256-NEXT: zip1 v[[VALS2_LO:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b -; VBITS_EQ_256-NEXT: zip2 v[[VALS2_HI:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS2_LO]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[VALS2_HI]].h -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-NEXT: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s -; VBITS_EQ_256-NEXT: st1b { [[UPK2_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-NEXT: st1b { [[UPK2_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ldr d[[VALS:[0-9]+]], [x0] ; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] @@ -102,6 +77,32 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; VBITS_GE_512-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s ; VBITS_GE_512-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ldr d[[VALS:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0 +; VBITS_EQ_256-DAG: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8 +; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] +; VBITS_EQ_256-DAG: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8 +; VBITS_EQ_256-DAG: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8 +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0 +; VBITS_EQ_256-DAG: zip1 v[[VALS2_LO:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: zip2 v[[VALS2_HI:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS2_LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[VALS2_HI]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: st1b { [[UPK2_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: st1b { [[UPK2_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-NEXT: ret %vals = load <8 x i8>, <8 x i8>* %a %ptrs = load <8 x i8*>, <8 x i8*>* %b %mask = icmp eq <8 x i8> %vals, zeroinitializer @@ -193,25 +194,6 @@ define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8i16: -; VBITS_EQ_256: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: 
ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 -; VBITS_EQ_256-NEXT: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_EQ_256-DAG: ext v[[EXT:[0-9]+]].16b, v[[VALS]].16b, v[[VALS]].16b, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0 -; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0 -; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS]].h -; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[EXT]].h -; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s -; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s -; VBITS_EQ_256-DAG: st1h { [[UPK2_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-DAG: st1h { [[UPK2_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0] ; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] @@ -222,6 +204,26 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s ; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ldr q[[VALS:[0-9]+]], [x0] +; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0 +; VBITS_EQ_256-DAG: ext v[[EXT:[0-9]+]].16b, v[[VALS]].16b, v[[VALS]].16b, #8 +; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8 +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[EXT]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: st1h { [[UPK2_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: st1h { [[UPK2_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-NEXT: ret %vals = load <8 x i16>, <8 x i16>* %a %ptrs = load <8 x i16*>, <8 x i16*>* %b %mask = icmp eq <8 x i16> %vals, zeroinitializer @@ -306,30 +308,6 @@ define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8i32: -; VBITS_EQ_256: ptrue [[PG0:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] -; VBITS_EQ_256-NEXT: add x8, x1, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG1]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 -; VBITS_EQ_256-NEXT: add x8, sp, #32 -; VBITS_EQ_256-NEXT: mov x9, sp -; VBITS_EQ_256-NEXT: 
mov [[MONE:z[0-9]+]].s, p1/z, #-1 -; VBITS_EQ_256-NEXT: st1w { [[MONE]].s }, [[PG0]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[VALS]].s }, [[PG0]], [x9] -; VBITS_EQ_256-NEXT: ldr q[[CMP_LO:[0-9]+]], [sp, #32] -; VBITS_EQ_256-NEXT: ldr q[[VAL_LO:[0-9]+]], [sp] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl4 -; VBITS_EQ_256-NEXT: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0 -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].d, z[[VAL_LO]].s -; VBITS_EQ_256-NEXT: st1w { [[UPK1_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-NEXT: ldr q[[CMP_HI:[0-9]+]], [sp, #48] -; VBITS_EQ_256-NEXT: ldr q[[VAL_HI:[0-9]+]], [sp, #16] -; VBITS_EQ_256-NEXT: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0 -; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].d, z[[VAL_HI]].s -; VBITS_EQ_256-NEXT: st1w { [[UPK1_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] - ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8 ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 @@ -338,6 +316,31 @@ define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG1]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG1]]/z, [x8] +; VBITS_EQ_256-DAG: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_EQ_256-DAG: add x8, sp, #32 +; VBITS_EQ_256-DAG: mov x9, sp +; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, p1/z, #-1 +; VBITS_EQ_256-DAG: st1w { [[MONE]].s }, [[PG0]], [x8] +; VBITS_EQ_256-DAG: st1w { [[VALS]].s }, [[PG0]], [x9] +; VBITS_EQ_256-DAG: ldr q[[CMP_LO:[0-9]+]], [sp, #32] +; VBITS_EQ_256-DAG: ldr q[[VAL_LO:[0-9]+]], [sp] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].d, z[[VAL_LO]].s +; VBITS_EQ_256-DAG: st1w { [[UPK1_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: ldr q[[CMP_HI:[0-9]+]], [sp, #48] +; VBITS_EQ_256-DAG: ldr q[[VAL_HI:[0-9]+]], [sp, #16] +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].d, z[[VAL_HI]].s +; VBITS_EQ_256-DAG: st1w { [[UPK1_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] %vals = load <8 x i32>, <8 x i32>* %a %ptrs = load <8 x i32*>, <8 x i32*>* %b %mask = icmp eq <8 x i32> %vals, zeroinitializer @@ -427,25 +430,26 @@ define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 { define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8i64: -; VBITS_EQ_256: ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x8, x0, #32 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1d { [[VALS_LO:z[0-9]+]].d }, [[PG0]]/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { [[VALS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { 
[[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x9] -; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq [[MASK_HI:p[0-9]+]].d, [[PG0]]/z, [[VALS_HI]].d, #0 -; VBITS_EQ_256-NEXT: cmpeq [[MASK_LO:p[0-9]+]].d, [[PG0]]/z, [[VALS_LO]].d, #0 -; VBITS_EQ_256-NEXT: st1d { [[VALS_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] -; VBITS_EQ_256-NEXT: st1d { [[VALS_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] ; VBITS_GE_512-NEXT: cmpeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0 ; VBITS_GE_512-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[VALS_LO:z[0-9]+]].d }, [[PG0]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[VALS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x9] +; VBITS_EQ_256-DAG: cmpeq [[MASK_LO:p[0-9]+]].d, [[PG0]]/z, [[VALS_LO]].d, #0 +; VBITS_EQ_256-DAG: cmpeq [[MASK_HI:p[0-9]+]].d, [[PG0]]/z, [[VALS_HI]].d, #0 +; VBITS_EQ_256-DAG: st1d { [[VALS_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: st1d { [[VALS_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] +; VBITS_EQ_256-NEXT: ret %vals = load <8 x i64>, <8 x i64>* %a %ptrs = load <8 x i64*>, <8 x i64*>* %b %mask = icmp eq <8 x i64> %vals, zeroinitializer diff --git a/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll index 5db9d4008d5..7ba6cd438e5 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll @@ 
-24,7 +24,7 @@ define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v2i64i8 ; CHECK: ldr q[[Q0:[0-9]+]], [x0] ; CHECK: ptrue p[[P0:[0-9]+]].d, vl2 -; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x{{[0-9]+}}] +; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1] ; CHECK-NEXT: ret %a = load <2 x i64>, <2 x i64>* %ap %val = trunc <2 x i64> %a to <2 x i8> @@ -36,7 +36,7 @@ define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v4i64i8 ; CHECK: ptrue p[[P0:[0-9]+]].d, vl4 ; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x{{[0-9]+}}] +; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1] ; CHECK-NEXT: ret %a = load <4 x i64>, <4 x i64>* %ap %val = trunc <4 x i64> %a to <4 x i8> @@ -48,20 +48,21 @@ define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v8i64i8: ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4 -; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s -; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s -; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s -; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8 -; VBITS_EQ_256-DAG: st1b { [[Z1]].s }, [[PG]], [x1] -; VBITS_EQ_256-DAG: ret +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue 
[[PG2:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s +; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG2]], [[WORDS_LO]].s, [[WORDS_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8 +; VBITS_EQ_256-NEXT: st1b { [[WORDS]].s }, [[PG3]], [x1] +; VBITS_EQ_256-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %val = trunc <8 x i64> %a to <8 x i8> store <8 x i8> %val, <8 x i8>* %dest @@ -72,7 +73,7 @@ define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v16i64i8: ; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <16 x i64>, <16 x i64>* %ap %val = trunc <16 x i64> %a to <16 x i8> @@ -84,7 +85,7 @@ define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v32i64i8: ; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i64>, <32 x i64>* %ap %val = trunc <32 x i64> %a to <32 x i8> @@ -96,21 +97,22 @@ define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 { ; CHECK-LABEL: store_trunc_v8i64i16: ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x1] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
; Currently does not use the truncating store -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s -; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s -; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h -; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h -; VBITS_EQ_256-DAG: mov v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0] -; VBITS_EQ_256-DAG: str q[[V0]], [x1] -; VBITS_EQ_256-DAG: ret +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s +; VBITS_EQ_256-DAG: uzp1 z[[HALFS_LO:[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h +; VBITS_EQ_256-DAG: uzp1 z[[HALFS_HI:[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h +; VBITS_EQ_256-NEXT: mov v[[HALFS_LO]].d[1], v[[HALFS_HI]].d[0] +; VBITS_EQ_256-NEXT: str q[[HALFS_LO]], [x1] +; VBITS_EQ_256-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %val = trunc <8 x i64> %a to <8 x i16> store <8 x i16> %val, <8 x i16>* %dest @@ -121,20 +123,21 @@ define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 { ; CHECK-LABEL: store_trunc_v8i64i32: ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x1] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0] -; 
VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4 -; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s -; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s -; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s -; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8 -; VBITS_EQ_256-DAG: st1w { [[Z1]].s }, [[PG]], [x1] -; VBITS_EQ_256-DAG: ret +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4 +; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s +; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s +; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG1]], [[WORDS_LO]].s, [[WORDS_HI]].s +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8 +; VBITS_EQ_256-NEXT: st1w { [[WORDS]].s }, [[PG3]], [x1] +; VBITS_EQ_256-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %val = trunc <8 x i64> %a to <8 x i32> store <8 x i32> %val, <8 x i32>* %dest @@ -145,21 +148,22 @@ define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v16i32i8: ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x1] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
; Currently does not use the truncating store -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h -; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h -; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b -; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b -; VBITS_EQ_256-DAG: mov v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0] -; VBITS_EQ_256-DAG: str q[[V0]], [x1] -; VBITS_EQ_256-DAG: ret +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h +; VBITS_EQ_256-DAG: uzp1 z[[BYTES_LO:[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b +; VBITS_EQ_256-DAG: uzp1 z[[BYTES_HI:[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b +; VBITS_EQ_256-NEXT: mov v[[BYTES_LO]].d[1], v[[BYTES_HI]].d[0] +; VBITS_EQ_256-NEXT: str q[[BYTES_LO]], [x1] +; VBITS_EQ_256-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %val = trunc <16 x i32> %a to <16 x i8> store <16 x i8> %val, <16 x i8>* %dest @@ -170,20 +174,21 @@ define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 { ; CHECK-LABEL: store_trunc_v16i32i16: ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x1] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0] -; 
VBITS_EQ_256-DAG: ptrue [[PG]].h, vl8 -; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h -; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h -; VBITS_EQ_256-DAG: splice [[Z1]].h, [[PG]], [[Z1]].h, [[Z0]].h -; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl16 -; VBITS_EQ_256-DAG: st1h { [[Z1]].h }, [[PG]], [x1] -; VBITS_EQ_256-DAG: ret +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].h, vl8 +; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h +; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h +; VBITS_EQ_256-DAG: splice [[HALFS:z[0-9]+]].h, [[PG2]], [[HALFS_LO]].h, [[HALFS_HI]].h +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl16 +; VBITS_EQ_256-NEXT: st1h { [[HALFS]].h }, [[PG3]], [x1] +; VBITS_EQ_256-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %val = trunc <16 x i32> %a to <16 x i16> store <16 x i16> %val, <16 x i16>* %dest @@ -194,25 +199,25 @@ define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 { ; CHECK-LABEL: store_trunc_v32i16i8: ; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0] -; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x1] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-DAG: ld1h { [[Z1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl16 -; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b -; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b -; VBITS_EQ_256-DAG: splice [[Z1]].b, [[PG]], [[Z1]].b, [[Z0]].b -; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl32 -; 
VBITS_EQ_256-DAG: st1b { [[Z1]].b }, [[PG]], [x1] -; VBITS_EQ_256-DAG: ret +; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: ld1h { [[HALFS_LO:z[0-9]+]].h }, [[PG1]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[HALFS_HI:z[0-9]+]].h }, [[PG1]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].b, vl16 +; VBITS_EQ_256-DAG: uzp1 [[BYTES_LO:z[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b +; VBITS_EQ_256-DAG: uzp1 [[BYTES_HI:z[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b +; VBITS_EQ_256-DAG: splice [[BYTES:z[0-9]+]].b, [[PG2]], [[BYTES_LO]].b, [[BYTES_HI]].b +; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].b, vl32 +; VBITS_EQ_256-NEXT: st1b { [[BYTES]].b }, [[PG3]], [x1] +; VBITS_EQ_256-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = trunc <32 x i16> %a to <32 x i8> store <32 x i8> %val, <32 x i8>* %dest ret void } - attributes #0 = { "target-features"="+sve" } diff --git a/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll index 00fb4a38ad7..a12faea0102 100644 --- a/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ b/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -61,21 +61,6 @@ define void @shuffle_ext_byone_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v64i8 -; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl32 -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x8] -; VBITS_EQ_256-NEXT: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x8] -; VBITS_EQ_256-NEXT: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].b, [[OP1_HI]].b[31] -; VBITS_EQ_256-NEXT: fmov [[TMP1:w[0-9]+]], s[[ELEM1]] -; VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].b, [[OP2_LO]].b[31] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].b, [[TMP1]] -; VBITS_EQ_256-NEXT: fmov [[TMP2:w[0-9]+]], 
s[[ELEM2]] -; VBITS_EQ_256-NEXT: insr [[OP2_HI]].b, [[TMP2]] -; VBITS_EQ_256-NEXT: st1b { [[OP2_HI]].b }, [[PG]], [x0, x8] -; VBITS_EQ_256-NEXT: st1b { [[OP2_LO]].b }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 ; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] @@ -84,6 +69,22 @@ define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].b, [[TMP]] ; VBITS_GE_512-NEXT: st1b { [[OP2]].b }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 +; VBITS_EQ_256-DAG: mov w8, #32 +; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x8] +; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x8] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].b, [[OP1_HI]].b[31] +; VBITS_EQ_256-DAG: fmov [[TMP1:w[0-9]+]], s[[ELEM1]] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].b, [[OP2_LO]].b[31] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].b, [[TMP1]] +; VBITS_EQ_256-DAG: fmov [[TMP2:w[0-9]+]], s[[ELEM2]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].b, [[TMP2]] +; VBITS_EQ_256-DAG: st1b { [[OP2_LO]].b }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1b { [[OP2_HI]].b }, [[PG]], [x0, x8] +; VBITS_EQ_256-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %ret = shufflevector <64 x i8> %op1, <64 x i8> %op2, <64 x i32> * %a, <16 x i16>* %b) #0 { define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v32i16 -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x9] -; VBITS_EQ_256-NEXT: ld1h { [[OP2_LO:z[0-9]+]].h }, 
[[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].h, [[OP1_HI]].h[15] -; VBITS_EQ_256-NEXT: fmov [[TMP1:w[0-9]+]], s[[ELEM1]] -; VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].h, [[OP2_LO]].h[15] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].h, [[TMP1]] -; VBITS_EQ_256-NEXT: fmov [[TMP2:w[0-9]+]], s[[ELEM2]] -; VBITS_EQ_256-NEXT: insr [[OP2_HI]].h, [[TMP2]] -; VBITS_EQ_256-NEXT: st1h { [[OP2_HI]].h }, [[PG]], [x8] -; VBITS_EQ_256-NEXT: st1h { [[OP2_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] @@ -242,6 +227,23 @@ define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].h, [[TMP]] ; VBITS_GE_512-NEXT: st1h { [[OP2]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].h, [[OP1_HI]].h[15] +; VBITS_EQ_256-DAG: fmov [[TMP1:w[0-9]+]], s[[ELEM1]] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].h, [[OP2_LO]].h[15] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].h, [[TMP1]] +; VBITS_EQ_256-DAG: fmov [[TMP2:w[0-9]+]], s[[ELEM2]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].h, [[TMP2]] +; VBITS_EQ_256-DAG: st1h { [[OP2_LO]].h }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1h { [[OP2_HI]].h }, [[PG]], [x8] +; VBITS_EQ_256-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b %ret = shufflevector <32 x i16> %op1, <32 x i16> %op2, <32 x i32> * %a, <8 x i32>* %b) #0 { define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK-LABEL: 
shuffle_ext_byone_v16i32 -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x9] -; VBITS_EQ_256-NEXT: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].s, [[OP1_HI]].s[7] -; VBITS_EQ_256-NEXT: fmov [[TMP1:w[0-9]+]], s[[ELEM1]] -; VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].s, [[OP2_LO]].s[7] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].s, [[TMP1]] -; VBITS_EQ_256-NEXT: fmov [[TMP2:w[0-9]+]], s[[ELEM2]] -; VBITS_EQ_256-NEXT: insr [[OP2_HI]].s, [[TMP2]] -; VBITS_EQ_256-NEXT: st1w { [[OP2_HI]].s }, [[PG]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[OP2_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] @@ -371,6 +357,24 @@ define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].s, [[TMP]] ; VBITS_GE_512-NEXT: st1w { [[OP2]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].s, [[OP1_HI]].s[7] +; VBITS_EQ_256-DAG: fmov [[TMP1:w[0-9]+]], s[[ELEM1]] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].s, [[OP2_LO]].s[7] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].s, [[TMP1]] +; VBITS_EQ_256-DAG: fmov [[TMP2:w[0-9]+]], s[[ELEM2]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].s, [[TMP2]] +; VBITS_EQ_256-DAG: st1w { [[OP2_LO]].s }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1w { [[OP2_HI]].s }, [[PG]], [x8] +; VBITS_EQ_256-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %ret = shufflevector <16 x i32> %op1, <16 x i32> %op2, <16 x i32> * %a, <4 x i64>* %b) #0 { define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v8i64 -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x9] -; VBITS_EQ_256-NEXT: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].d, [[OP1_HI]].d[3] -; VBITS_EQ_256-NEXT: fmov [[TMP1:x[0-9]+]], d[[ELEM1]] -; VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].d, [[OP2_LO]].d[3] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].d, [[TMP1]] -; VBITS_EQ_256-NEXT: fmov [[TMP2:x[0-9]+]], d[[ELEM2]] -; VBITS_EQ_256-NEXT: insr [[OP2_HI]].d, [[TMP2]] -; VBITS_EQ_256-NEXT: st1d { [[OP2_HI]].d }, [[PG]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[OP2_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; 
VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] @@ -477,6 +465,23 @@ define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].d, [[TMP]] ; VBITS_GE_512-NEXT: st1d { [[OP2]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].d, [[OP1_HI]].d[3] +; VBITS_EQ_256-DAG: fmov [[TMP1:x[0-9]+]], d[[ELEM1]] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].d, [[OP2_LO]].d[3] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].d, [[TMP1]] +; VBITS_EQ_256-DAG: fmov [[TMP2:x[0-9]+]], d[[ELEM2]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].d, [[TMP2]] +; VBITS_EQ_256-DAG: st1d { [[OP2_LO]].d }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1d { [[OP2_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %op2 = load <8 x i64>, <8 x i64>* %b %ret = shufflevector <8 x i64> %op1, <8 x i64> %op2, <8 x i32> @@ -561,20 +566,6 @@ define void @shuffle_ext_byone_v16f16(<16 x half>* %a, <16 x half>* %b) #0 { define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v32f16 -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x9] -; VBITS_EQ_256-NEXT: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].h, [[OP1_HI]].h[15] -; VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].h, [[OP2_LO]].h[15] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].h, h[[ELEM1]] -; VBITS_EQ_256-NEXT: insr 
[[OP2_HI]].h, h[[ELEM2]] -; VBITS_EQ_256-NEXT: st1h { [[OP2_HI]].h }, [[PG]], [x8] -; VBITS_EQ_256-NEXT: st1h { [[OP2_LO]].h }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] @@ -582,6 +573,21 @@ define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].h, h[[ELEM]] ; VBITS_GE_512-NEXT: st1h { [[OP2]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x9] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].h, [[OP2_LO]].h[15] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].h, [[OP1_HI]].h[15] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].h, h[[ELEM1]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].h, h[[ELEM2]] +; VBITS_EQ_256-DAG: st1h { [[OP2_LO]].h }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1h { [[OP2_HI]].h }, [[PG]], [x8] +; VBITS_EQ_256-NEXT: ret %op1 = load <32 x half>, <32 x half>* %a %op2 = load <32 x half>, <32 x half>* %b %ret = shufflevector <32 x half> %op1, <32 x half> %op2, <32 x i32> * %a, <8 x float>* %b) #0 { define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v16f32 -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x9] -; VBITS_EQ_256-NEXT: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].s, [[OP1_HI]].s[7] -; 
VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].s, [[OP2_LO]].s[7] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].s, s[[ELEM1]] -; VBITS_EQ_256-NEXT: insr [[OP2_HI]].s, s[[ELEM2]] -; VBITS_EQ_256-NEXT: st1w { [[OP2_HI]].s }, [[PG]], [x8] -; VBITS_EQ_256-NEXT: st1w { [[OP2_LO]].s }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] @@ -707,6 +699,21 @@ define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].s, s[[ELEM]] ; VBITS_GE_512-NEXT: st1w { [[OP2]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x9] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].s, [[OP2_LO]].s[7] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].s, [[OP1_HI]].s[7] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].s, s[[ELEM1]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].s, s[[ELEM2]] +; VBITS_EQ_256-DAG: st1w { [[OP2_LO]].s }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1w { [[OP2_HI]].s }, [[PG]], [x8] +; VBITS_EQ_256-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %op2 = load <16 x float>, <16 x float>* %b %ret = shufflevector <16 x float> %op1, <16 x float> %op2, <16 x i32> * %a, <4 x double>* %b) #0 { define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v8f64 -; VBITS_EQ_256: add x8, x0, #32 -; VBITS_EQ_256-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-NEXT: add x9, x1, #32 -; VBITS_EQ_256-NEXT: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x8] -; VBITS_EQ_256-NEXT: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x9] -; 
VBITS_EQ_256-NEXT: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] -; VBITS_EQ_256-NEXT: mov z[[ELEM1:[0-9]+]].d, [[OP1_HI]].d[3] -; VBITS_EQ_256-NEXT: mov z[[ELEM2:[0-9]+]].d, [[OP2_LO]].d[3] -; VBITS_EQ_256-NEXT: insr [[OP2_LO]].d, d[[ELEM1]] -; VBITS_EQ_256-NEXT: insr [[OP2_HI]].d, d[[ELEM2]] -; VBITS_EQ_256-NEXT: st1d { [[OP2_HI]].d }, [[PG]], [x8] -; VBITS_EQ_256-NEXT: st1d { [[OP2_LO]].d }, [[PG]], [x0] -; VBITS_EQ_256-NEXT: ret - ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] @@ -809,6 +802,21 @@ define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 { ; VBITS_GE_512-NEXT: insr [[OP2]].d, d[[ELEM]] ; VBITS_GE_512-NEXT: st1d { [[OP2]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x8, x0, #32 +; VBITS_EQ_256-DAG: add x9, x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x8] +; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x9] +; VBITS_EQ_256-DAG: mov z[[ELEM2:[0-9]+]].d, [[OP2_LO]].d[3] +; VBITS_EQ_256-DAG: mov z[[ELEM1:[0-9]+]].d, [[OP1_HI]].d[3] +; VBITS_EQ_256-DAG: insr [[OP2_LO]].d, d[[ELEM1]] +; VBITS_EQ_256-DAG: insr [[OP2_HI]].d, d[[ELEM2]] +; VBITS_EQ_256-DAG: st1d { [[OP2_LO]].d }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1d { [[OP2_HI]].d }, [[PG]], [x8] +; VBITS_EQ_256-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %op2 = load <8 x double>, <8 x double>* %b %ret = shufflevector <8 x double> %op1, <8 x double> %op2, <8 x i32>