From 170d2a8c252ac7cb78e3d9b75fcaaa591700eae0 Mon Sep 17 00:00:00 2001 From: Arnold Schwaighofer Date: Wed, 20 Feb 2013 21:33:32 +0000 Subject: [PATCH] DAGCombiner: Fold pointless truncate, bitcast, buildvector series (2xi32) (truncate ((2xi64) bitcast (buildvector i32 a, i32 x, i32 b, i32 y))) can be folded into a (2xi32) (buildvector i32 a, i32 b). Such a DAG would cause uneccessary vdup instructions followed by vmovn instructions. We generate this code on ARM NEON for a setcc olt, 2xf64, 2xf64. For example, in the vectorized version of the code below. double A[N]; double B[N]; void test_double_compare_to_double() { int i; for(i=0;i Opnds; + for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) + Opnds.push_back(BuildVect.getOperand(i)); + + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, &Opnds[0], + Opnds.size()); + } + } + // See if we can simplify the input to this truncate through knowledge that // only the low bits are being used. // For example "trunc (or (shl x, 8), y)" // -> trunc y diff --git a/test/CodeGen/ARM/neon_cmp.ll b/test/CodeGen/ARM/neon_cmp.ll new file mode 100644 index 00000000000..046b5da2289 --- /dev/null +++ b/test/CodeGen/ARM/neon_cmp.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s +; bug 15283 +; radar://13191881 +; CHECK: vfcmp +define void @vfcmp(<2 x double>* %a, <2 x double>* %b) { + %wide.load = load <2 x double>* %a, align 4 + %wide.load2 = load <2 x double>* %b, align 4 +; CHECK-NOT: vdup.32 +; CHECK-NOT: vmovn.i64 + %v1 = fcmp olt <2 x double> %wide.load, %wide.load2 + %v2 = zext <2 x i1> %v1 to <2 x i32> + %v3 = sitofp <2 x i32> %v2 to <2 x double> + store <2 x double> %v3, <2 x double>* %b, align 4 + ret void +}