From 0dab4cc8a00f613586654f56945d4a9a00630f8a Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Fri, 7 Aug 2009 19:30:41 +0000 Subject: [PATCH] Fix support to use NEON for single precision fp math. llvm-svn: 78397 --- lib/Target/ARM/ARMBaseInstrInfo.cpp | 12 +- lib/Target/ARM/ARMInstrNEON.td | 124 +++++++++++++------- test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll | 80 +++++++++++++ 3 files changed, 170 insertions(+), 46 deletions(-) create mode 100644 test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 911b84dc7de..ae28ccbb2de 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -587,7 +587,7 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, } break; case ARM::FSTD: - case ARM::FSTS: + case ARM::FSTS: if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { @@ -610,8 +610,10 @@ ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, if (I != MBB.end()) DL = I->getDebugLoc(); if (DestRC != SrcRC) { - if (((DestRC == ARM::DPRRegisterClass) && (SrcRC == ARM::DPR_VFP2RegisterClass)) || - ((SrcRC == ARM::DPRRegisterClass) && (DestRC == ARM::DPR_VFP2RegisterClass))) { + if (((DestRC == ARM::DPRRegisterClass) && + (SrcRC == ARM::DPR_VFP2RegisterClass)) || + ((SrcRC == ARM::DPRRegisterClass) && + (DestRC == ARM::DPR_VFP2RegisterClass))) { // Allow copy between DPR and DPR_VFP2. 
} else { return false; @@ -648,7 +650,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addReg(0).addImm(0)); - } else if (RC == ARM::DPRRegisterClass) { + } else if (RC == ARM::DPRRegisterClass || RC == ARM::DPR_VFP2RegisterClass) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FSTD)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0)); @@ -670,7 +672,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (RC == ARM::GPRRegisterClass) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) .addFrameIndex(FI).addReg(0).addImm(0)); - } else if (RC == ARM::DPRRegisterClass) { + } else if (RC == ARM::DPRRegisterClass || RC == ARM::DPR_VFP2RegisterClass) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FLDD), DestReg) .addFrameIndex(FI).addImm(0)); } else { diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 2e8e0a294f5..7cceea22105 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -334,13 +334,18 @@ class N2VQInt op24_23, bits<2> op21_20, bits<2> op19_18, [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; // Basic 2-register operations, scalar single-precision -class N2VDInts +class N2VDInts op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V; + +class N2VDIntsPat : NEONFPPat<(f32 (OpNode SPR:$a)), - (EXTRACT_SUBREG (COPY_TO_REGCLASS - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$a, arm_ssubreg_0)), - DPR_VFP2), - arm_ssubreg_0)>; + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)), + arm_ssubreg_0)>; // Narrow 2-register intrinsics. 
class N2VNInt op24_23, bits<2> op21_20, bits<2> op19_18, @@ -380,15 +385,20 @@ class N3VQ op21_20, bits<4> op11_8, bit op4, } // Basic 3-register operations, scalar single-precision -class N3VDs +class N3VDs op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3V { + let isCommutable = Commutable; +} +class N3VDsPat : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), - (EXTRACT_SUBREG (COPY_TO_REGCLASS - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$a, arm_ssubreg_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$b, arm_ssubreg_0)), - DPR_VFP2), - arm_ssubreg_0)>; + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)), + arm_ssubreg_0)>; // Basic 3-register intrinsics, both double- and quad-register. class N3VDInt op21_20, bits<4> op11_8, bit op4, @@ -427,18 +437,20 @@ class N3VQMulOp op21_20, bits<4> op11_8, bit op4, (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; // Multiply-Add/Sub operations, scalar single-precision -class N3VDMulOps - : NEONFPPat<(f32 (OpNode SPR:$acc, - (f32 (MulNode SPR:$a, SPR:$b)))), - (EXTRACT_SUBREG (COPY_TO_REGCLASS - (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$acc, arm_ssubreg_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$a, arm_ssubreg_0), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), - SPR:$b, arm_ssubreg_0)), - DPR_VFP2), - arm_ssubreg_0)>; +class N3VDMulOps op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V; + +class N3VDMulOpsPat + : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), + (EXTRACT_SUBREG + (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$acc, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)), + arm_ssubreg_0)>; // Neon 3-argument intrinsics, both double- and quad-register. 
// The destination register is also used as the first source operand register. @@ -1011,9 +1023,6 @@ defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>; // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>; -// Vector Add Operations used for single-precision FP -def : N3VDs; - // Vector Multiply Operations. // VMUL : Vector Multiply (integer, polynomial and floating-point) @@ -1036,9 +1045,6 @@ def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8, // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>; -// Vector Multiply Operations used for single-precision FP -def : N3VDs; - // Vector Multiply-Accumulate and Multiply-Subtract Operations. // VMLA : Vector Multiply Accumulate (integer and floating-point) @@ -1060,10 +1066,6 @@ defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>; // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>; -// Vector Multiply-Accumulate/Subtract used for single-precision FP -def : N3VDMulOps; -def : N3VDMulOps; - // Vector Subtract Operations. // VSUB : Vector Subtract (integer and floating-point) @@ -1087,9 +1089,6 @@ defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>; // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>; -// Vector Sub Operations used for single-precision FP -def : N3VDs; - // Vector Comparisons. 
// VCEQ : Vector Compare Equal @@ -1453,7 +1452,6 @@ def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", v2f32, v2f32, int_arm_neon_vabsf>; def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", v4f32, v4f32, int_arm_neon_vabsf>; -def : N2VDInts; // VQABS : Vector Saturating Absolute Value defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s", @@ -1492,7 +1490,6 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, (outs QPR:$dst), (ins QPR:$src), NoItinerary, "vneg.f32\t$dst, $src", "", [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; -def : N2VDInts; def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>; def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>; @@ -1906,6 +1903,51 @@ class VREV16Q op19_18, string OpcodeStr, ValueType Ty> def VREV16d8 : VREV16D<0b00, "vrev16.8", v8i8>; def VREV16q8 : VREV16Q<0b00, "vrev16.8", v16i8>; +//===----------------------------------------------------------------------===// +// NEON instructions for single-precision FP math +//===----------------------------------------------------------------------===// + +// These need separate instructions because they must use the DPR_VFP2 register +// class, which has SPR sub-registers. 
+ +// Vector Add Operations used for single-precision FP +let neverHasSideEffects = 1 in +def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd,1>; +def : N3VDsPat; + +// Vector Multiply Operations used for single-precision FP +let neverHasSideEffects = 1 in +def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul,1>; +def : N3VDsPat; + +// Vector Multiply-Accumulate/Subtract used for single-precision FP +let neverHasSideEffects = 1 in +def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32,fmul,fadd>; +def : N3VDMulOpsPat; + +let neverHasSideEffects = 1 in +def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32,fmul,fsub>; +def : N3VDMulOpsPat; + +// Vector Sub Operations used for single-precision FP +let neverHasSideEffects = 1 in +def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub,0>; +def : N3VDsPat; + +// Vector Absolute for single-precision FP +let neverHasSideEffects = 1 in +def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", + v2f32, v2f32, int_arm_neon_vabsf>; +def : N2VDIntsPat; + +// Vector Negate for single-precision FP + +let neverHasSideEffects = 1 in +def VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), NoItinerary, + "vneg.f32\t$dst, $src", "", []>; +def : N2VDIntsPat; + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll b/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll new file mode 100644 index 00000000000..ee888d4adec --- /dev/null +++ b/test/CodeGen/Thumb2/2009-08-07-NeonFPBug.ll @@ -0,0 +1,80 @@ +; RUN: llvm-as < %s | llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -mattr=+neonfp + + %struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, 
i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } + %struct.JHUFF_TBL = type { [17 x i8], [256 x i8], i32 } + %struct.JQUANT_TBL = type { [64 x i16], i32 } + %struct.__sFILEX = type opaque + %struct.__sbuf = type { i8*, i32 } + %struct.anon = type { [8 x i32], [48 x i8] } + %struct.backing_store_info = type { void (%struct.jpeg_common_struct*, %struct.backing_store_info*, i8*, i32, i32)*, void (%struct.jpeg_common_struct*, %struct.backing_store_info*, i8*, i32, i32)*, void (%struct.jpeg_common_struct*, %struct.backing_store_info*)*, %struct.FILE*, [64 x i8] } + %struct.jpeg_color_deconverter = type { void (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*, i8***, i32, i8**, i32)* } + %struct.jpeg_color_quantizer = type { void (%struct.jpeg_decompress_struct*, i32)*, void (%struct.jpeg_decompress_struct*, i8**, i8**, i32)*, void (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*)* } + %struct.jpeg_common_struct = type { %struct.jpeg_error_mgr*, %struct.jpeg_memory_mgr*, %struct.jpeg_progress_mgr*, i32, i32 } + %struct.jpeg_component_info = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.JQUANT_TBL*, i8* } + %struct.jpeg_d_coef_controller = type { void (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*, i8***)*, %struct.jvirt_barray_control** } + %struct.jpeg_d_main_controller = type { void (%struct.jpeg_decompress_struct*, i32)*, void (%struct.jpeg_decompress_struct*, i8**, i32*, i32)* } + %struct.jpeg_d_post_controller = type { void (%struct.jpeg_decompress_struct*, i32)*, void (%struct.jpeg_decompress_struct*, i8***, i32*, i32, i8**, i32*, i32)* } + %struct.jpeg_decomp_master = type { void (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*)*, i32 } 
+ %struct.jpeg_decompress_struct = type { %struct.jpeg_error_mgr*, %struct.jpeg_memory_mgr*, %struct.jpeg_progress_mgr*, i32, i32, %struct.jpeg_source_mgr*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %struct.JQUANT_TBL*], [4 x %struct.JHUFF_TBL*], [4 x %struct.JHUFF_TBL*], i32, %struct.jpeg_component_info*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i16, i16, i32, i8, i32, i32, i32, i32, i32, i8*, i32, [4 x %struct.jpeg_component_info*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %struct.jpeg_decomp_master*, %struct.jpeg_d_main_controller*, %struct.jpeg_d_coef_controller*, %struct.jpeg_d_post_controller*, %struct.jpeg_input_controller*, %struct.jpeg_marker_reader*, %struct.jpeg_entropy_decoder*, %struct.jpeg_inverse_dct*, %struct.jpeg_upsampler*, %struct.jpeg_color_deconverter*, %struct.jpeg_color_quantizer* } + %struct.jpeg_entropy_decoder = type { void (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*, [64 x i16]**)* } + %struct.jpeg_error_mgr = type { void (%struct.jpeg_common_struct*)*, void (%struct.jpeg_common_struct*, i32)*, void (%struct.jpeg_common_struct*)*, void (%struct.jpeg_common_struct*, i8*)*, void (%struct.jpeg_common_struct*)*, i32, %struct.anon, i32, i32, i8**, i32, i8**, i32, i32 } + %struct.jpeg_input_controller = type { i32 (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*)*, i32, i32 } + %struct.jpeg_inverse_dct = type { void (%struct.jpeg_decompress_struct*)*, [10 x void (%struct.jpeg_decompress_struct*, %struct.jpeg_component_info*, i16*, i8**, i32)*] } + %struct.jpeg_marker_reader = type { void (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*)*, [16 x 
i32 (%struct.jpeg_decompress_struct*)*], i32, i32, i32, i32 } + %struct.jpeg_memory_mgr = type { i8* (%struct.jpeg_common_struct*, i32, i32)*, i8* (%struct.jpeg_common_struct*, i32, i32)*, i8** (%struct.jpeg_common_struct*, i32, i32, i32)*, [64 x i16]** (%struct.jpeg_common_struct*, i32, i32, i32)*, %struct.jvirt_sarray_control* (%struct.jpeg_common_struct*, i32, i32, i32, i32, i32)*, %struct.jvirt_barray_control* (%struct.jpeg_common_struct*, i32, i32, i32, i32, i32)*, void (%struct.jpeg_common_struct*)*, i8** (%struct.jpeg_common_struct*, %struct.jvirt_sarray_control*, i32, i32, i32)*, [64 x i16]** (%struct.jpeg_common_struct*, %struct.jvirt_barray_control*, i32, i32, i32)*, void (%struct.jpeg_common_struct*, i32)*, void (%struct.jpeg_common_struct*)*, i32 } + %struct.jpeg_progress_mgr = type { void (%struct.jpeg_common_struct*)*, i32, i32, i32, i32 } + %struct.jpeg_source_mgr = type { i8*, i32, void (%struct.jpeg_decompress_struct*)*, i32 (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*, i32)*, i32 (%struct.jpeg_decompress_struct*, i32)*, void (%struct.jpeg_decompress_struct*)* } + %struct.jpeg_upsampler = type { void (%struct.jpeg_decompress_struct*)*, void (%struct.jpeg_decompress_struct*, i8***, i32*, i32, i8**, i32*, i32)*, i32 } + %struct.jvirt_barray_control = type { [64 x i16]**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.jvirt_barray_control*, %struct.backing_store_info } + %struct.jvirt_sarray_control = type { i8**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.jvirt_sarray_control*, %struct.backing_store_info } + +define arm_apcscc void @jpeg_idct_float(%struct.jpeg_decompress_struct* nocapture %cinfo, %struct.jpeg_component_info* nocapture %compptr, i16* nocapture %coef_block, i8** nocapture %output_buf, i32 %output_col) nounwind { +entry: + br label %bb + +bb: ; preds = %bb, %entry + %0 = load float* undef, align 4 ; [#uses=1] + %1 = fmul float undef, %0 ; [#uses=2] + %tmp73 = add i32 0, 224 ; 
[#uses=1] + %scevgep74 = getelementptr i8* null, i32 %tmp73 ; [#uses=1] + %scevgep7475 = bitcast i8* %scevgep74 to float* ; [#uses=1] + %2 = load float* null, align 4 ; [#uses=1] + %3 = fmul float 0.000000e+00, %2 ; [#uses=2] + %4 = fadd float %1, %3 ; [#uses=1] + %5 = fsub float %1, %3 ; [#uses=2] + %6 = fadd float undef, 0.000000e+00 ; [#uses=2] + %7 = fmul float undef, 0x3FF6A09E60000000 ; [#uses=1] + %8 = fsub float %7, %6 ; [#uses=2] + %9 = fsub float %4, %6 ; [#uses=1] + %10 = fadd float %5, %8 ; [#uses=2] + %11 = fsub float %5, %8 ; [#uses=1] + %12 = sitofp i16 undef to float ; [#uses=1] + %13 = fmul float %12, 0.000000e+00 ; [#uses=2] + %14 = sitofp i16 undef to float ; [#uses=1] + %15 = load float* %scevgep7475, align 4 ; [#uses=1] + %16 = fmul float %14, %15 ; [#uses=2] + %17 = fadd float undef, undef ; [#uses=2] + %18 = fadd float %13, %16 ; [#uses=2] + %19 = fsub float %13, %16 ; [#uses=1] + %20 = fadd float %18, %17 ; [#uses=2] + %21 = fsub float %18, %17 ; [#uses=1] + %22 = fmul float %21, 0x3FF6A09E60000000 ; [#uses=1] + %23 = fmul float undef, 0x3FFD906BC0000000 ; [#uses=2] + %24 = fmul float %19, 0x3FF1517A80000000 ; [#uses=1] + %25 = fsub float %24, %23 ; [#uses=1] + %26 = fadd float undef, %23 ; [#uses=1] + %27 = fsub float %26, %20 ; [#uses=3] + %28 = fsub float %22, %27 ; [#uses=2] + %29 = fadd float %25, %28 ; [#uses=1] + %30 = fadd float undef, %20 ; [#uses=1] + store float %30, float* undef, align 4 + %31 = fadd float %10, %27 ; [#uses=1] + store float %31, float* undef, align 4 + %32 = fsub float %10, %27 ; [#uses=1] + store float %32, float* undef, align 4 + %33 = fadd float %11, %28 ; [#uses=1] + store float %33, float* undef, align 4 + %34 = fsub float %9, %29 ; [#uses=1] + store float %34, float* undef, align 4 + br label %bb +}