Couple of improvements:

- Added normalization to correlation calculation
- Heuristic that weights center of the processing window
2024-11-09 20:33:03 +01:00 · 2009-05-17 11:35:13 +00:00 · 2009-05-17 11:35:13 +00:00 · fb966425c4
commit fb966425c4
parent dc4004e0c3
5 changed files with 153 additions and 73 deletions
--- a/README.html
+++ b/README.html
@ -698,6 +698,7 @@ SoundTouch v1.3.1: </p>
  <li>Justin Frankel</li>
  <li>Jason Garland</li>
  <li>Takashi Iwai</li>
+  <li>John Sheehy</li>
 </ul>
 <p >Moral greetings to all other contributors and users also!</p>
 <hr>
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@ -51,6 +51,8 @@
 #include "cpu_detect.h"
 #include "TDStretch.h"

+#include <stdio.h>
+
 using namespace soundtouch;

 #define max(x, y) (((x) > (y)) ? (x) : (y))
@ -85,7 +87,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
 {
    bQuickSeek = FALSE;
    channels = 2;
-    bMidBufferDirty = FALSE;

    pMidBuffer = NULL;
    pRefMidBufferUnaligned = NULL;
@ -94,9 +95,14 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
    bAutoSeqSetting = TRUE;
    bAutoSeekSetting = TRUE;

+//    outDebt = 0;
+    skipFract = 0;
+
    tempo = 1.0f;
    setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
    setTempo(1.0f);
+
+    clear();
 }


@ -129,8 +135,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
    {
        this->sequenceMs = aSequenceMS;
        bAutoSeqSetting = FALSE;
-    } else {
-        // zero or below, use automatic setting
+    } 
+    else if (aSequenceMS == 0)
+    {
+        // if zero, use automatic setting
        bAutoSeqSetting = TRUE;
    }

@ -138,8 +146,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
    {
        this->seekWindowMs = aSeekWindowMS;
        bAutoSeekSetting = FALSE;
-    } else {
-        // zero or below, use automatic setting
+    } 
+    else if (aSeekWindowMS == 0) 
+    {
+        // if zero, use automatic setting
        bAutoSeekSetting = TRUE;
    }

@ -197,11 +207,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const

 void TDStretch::clearMidBuffer()
 {
-    if (bMidBufferDirty) 
-    {
-        memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
-        bMidBufferDirty = FALSE;
-    }
+    memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
 }


@ -216,8 +222,7 @@ void TDStretch::clearInput()
 void TDStretch::clear()
 {
    outputBuffer.clear();
-    inputBuffer.clear();
-    clearMidBuffer();
+    clearInput();
 }


@ -295,7 +300,7 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) 
 {
    int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
    int i;

    // Slopes the amplitudes of the 'midBuffer' samples
@ -310,7 +315,10 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
    {
        // Calculates correlation value for the mixing position corresponding
        // to 'i'
-        corr = calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+        corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+        // heuristic rule to slightly favour values close to mid of the range
+        double tmp = (double)(2 * i - seekLength) / (double)seekLength;
+        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));

        // Checks for the highest correlation value
        if (corr > bestCorr) 
@ -336,7 +344,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 {
    int j;
    int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
    int scanCount, corrOffset, tempOffset;

    // Slopes the amplitude of the 'midBuffer' samples
@ -363,7 +371,10 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)

            // Calculates correlation value for the mixing position corresponding
            // to 'tempOffset'
-            corr = calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+            corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+            // heuristic rule to slightly favour values close to mid of the range
+            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));

            // Checks for the highest correlation value
            if (corr > bestCorr) 
@ -392,7 +403,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) 
 {
    int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
    int tempOffset;
    const SAMPLETYPE *compare;

@ -410,7 +421,10 @@ int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos)

        // Calculates correlation value for the mixing position corresponding
        // to 'tempOffset'
-        corr = calcCrossCorrMono(pRefMidBuffer, compare);
+        corr = (double)calcCrossCorrMono(pRefMidBuffer, compare);
+        // heuristic rule to slightly favour values close to mid of the range
+        double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));

        // Checks for the highest correlation value
        if (corr > bestCorr) 
@ -436,7 +450,7 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
 {
    int j;
    int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
    int scanCount, corrOffset, tempOffset;

    // Slopes the amplitude of the 'midBuffer' samples
@ -463,7 +477,10 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)

            // Calculates correlation value for the mixing position corresponding
            // to 'tempOffset'
-            corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+            corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+            // heuristic rule to slightly favour values close to mid of the range
+            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));

            // Checks for the highest correlation value
            if (corr > bestCorr) 
@ -529,6 +546,10 @@ void TDStretch::calcSeqParameters()

    // Update seek window lengths
    seekWindowLength = (sampleRate * sequenceMs) / 1000;
+    if (seekWindowLength < 2 * overlapLength) 
+    {
+        seekWindowLength = 2 * overlapLength;
+    }
    seekLength = (sampleRate * seekWindowMs) / 1000;
 }

@ -547,11 +568,11 @@ void TDStretch::setTempo(float newTempo)

    // Calculate ideal skip length (according to tempo value) 
    nominalSkip = tempo * (seekWindowLength - overlapLength);
-    skipFract = 0;
    intskip = (int)(nominalSkip + 0.5f);

    // Calculate how many samples are needed in the 'inputBuffer' to 
    // process another batch of samples
+    //sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
    sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
 }

@ -602,6 +623,8 @@ void TDStretch::processNominalTempo()
 }
 */

+#include <stdio.h>
+
 // Processes as many processing frames of the samples 'inputBuffer', store
 // the result into 'outputBuffer'
 void TDStretch::processSamples()
@ -619,22 +642,9 @@ void TDStretch::processSamples()
    }
    */

-    if (bMidBufferDirty == FALSE) 
-    {
-        // if midBuffer is empty, move the first samples of the input stream 
-        // into it
-        if ((int)inputBuffer.numSamples() < overlapLength) 
-        {
-            // wait until we've got overlapLength samples
-            return;
-        }
-        memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE));
-        inputBuffer.receiveSamples((uint)overlapLength);
-        bMidBufferDirty = TRUE;
-    }
-
    // Process samples as long as there are enough samples in 'inputBuffer'
    // to form a processing frame.
+//    while ((int)inputBuffer.numSamples() >= sampleReq - (outDebt / 4)) 
    while ((int)inputBuffer.numSamples() >= sampleReq) 
    {
        // If tempo differs from the normal ('SCALE'), scan for the best overlapping
@ -648,20 +658,33 @@ void TDStretch::processSamples()
        overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
        outputBuffer.putSamples((uint)overlapLength);

-        // ... then copy sequence samples from 'inputBuffer' to output
-        temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe;
-        if (temp > 0)
+        // ... then copy sequence samples from 'inputBuffer' to output:
+        temp = (seekLength / 2 - offset);
+
+        // compensate cumulated output length diff vs. ideal output
+//        temp -= outDebt / 4;
+
+        // update ideal vs. true output difference 
+//        outDebt += temp;
+
+        // length of sequence
+//        temp += (seekWindowLength - 2 * overlapLength);
+        temp = (seekWindowLength - 2 * overlapLength);
+
+        // crosscheck that we don't have buffer overflow...
+        if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
        {
-            outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+            continue;    // just in case, shouldn't really happen
        }

+        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+
        // Copies the end of the current sequence from 'inputBuffer' to 
        // 'midBuffer' for being mixed with the beginning of the next 
        // processing sequence and so on
-        assert(offset + seekWindowLength <= (int)inputBuffer.numSamples());
-        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength), 
+        assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), 
            channels * sizeof(SAMPLETYPE) * overlapLength);
-        bMidBufferDirty = TRUE;

        // Remove the processed samples from the input buffer. Update
        // the difference between integer & nominal skip step to 'skipFract'
@ -701,7 +724,6 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength)
        delete[] pRefMidBufferUnaligned;

        pMidBuffer = new SAMPLETYPE[overlapLength * 2];
-        bMidBufferDirty = TRUE;
        clearMidBuffer();

        pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
@ -842,10 +864,14 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
    int newOvl;

    assert(aoverlapMs >= 0);
-    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0);
+
+    // calculate overlap length so that it's power of 2 - thus it's easy to do
+    // integer division by right-shifting. Term "-1" at end is to account for 
+    // the extra most significant bit left unused in result by signed multiplication 
+    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
    if (overlapDividerBits > 9) overlapDividerBits = 9;
-    if (overlapDividerBits < 4) overlapDividerBits = 4;
-    newOvl = (int)pow(2.0, (int)overlapDividerBits);
+    if (overlapDividerBits < 3) overlapDividerBits = 3;
+    newOvl = (int)pow(2.0, (int)overlapDividerBits + 1);    // +1 => account for -1 above

    acceptNewOverlapLength(newOvl);

@ -859,31 +885,41 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
 long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
 {
    long corr;
+    long norm;
    int i;

-    corr = 0;
+    corr = norm = 0;
    for (i = 1; i < overlapLength; i ++) 
    {
        corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
+        norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
    }

-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
 }


 long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
 {
    long corr;
+    long norm;
    int i;

-    corr = 0;
+    corr = norm = 0;
    for (i = 2; i < 2 * overlapLength; i += 2) 
    {
        corr += (mixingPos[i] * compare[i] +
                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;
+        norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits;
    }

-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
 }

 #endif // INTEGER_SAMPLES
@ -970,31 +1006,38 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
 {
    double corr;
+    double norm;
    int i;

-    corr = 0;
+    corr = norm = 0;
    for (i = 1; i < overlapLength; i ++) 
    {
        corr += mixingPos[i] * compare[i];
+        norm += mixingPos[i] * mixingPos[i];
    }

-    return corr;
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+    return corr / sqrt(norm);
 }


 double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
 {
    double corr;
+    double norm;
    int i;

-    corr = 0;
+    corr = norm = 0;
    for (i = 2; i < 2 * overlapLength; i += 2) 
    {
        corr += mixingPos[i] * compare[i] +
                mixingPos[i + 1] * compare[i + 1];
+        norm += mixingPos[i] * mixingPos[i] + 
+                mixingPos[i + 1] * mixingPos[i + 1];
    }

-    return corr;
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+    return corr / sqrt(norm);
 }

 #endif // FLOAT_SAMPLES
--- a/source/SoundTouch/TDStretch.h
+++ b/source/SoundTouch/TDStretch.h
@ -4,8 +4,8 @@
 /// while maintaining the original pitch by using a time domain WSOLA-like method 
 /// with several performance-increasing tweaks.
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific file, 
-/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Note : MMX/SSE optimized functions reside in separate, platform-specific files 
+/// 'mmx_optimized.cpp' and 'sse_optimized.cpp'
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@ -52,7 +52,13 @@
 namespace soundtouch
 {

-// Default values for sound processing parameters:
+/// Default values for sound processing parameters:
+/// Notice that the default parameters are tuned for contemporary popular music 
+/// processing. For speech processing applications these parameters suit better:
+///     #define DEFAULT_SEQUENCE_MS     40
+///     #define DEFAULT_SEEKWINDOW_MS   15
+///     #define DEFAULT_OVERLAP_MS      8
+///

 /// Default length of a single processing sequence, in milliseconds. This determines to how 
 /// long sequences the original sound is chopped in the time-stretch algorithm.
@ -62,7 +68,7 @@ namespace soundtouch
 /// and vice versa.
 ///
 /// Increasing this value reduces computational burden & vice versa.
-//#define DEFAULT_SEQUENCE_MS         130
+//#define DEFAULT_SEQUENCE_MS         40
 #define DEFAULT_SEQUENCE_MS         USE_AUTO_SEQUENCE_LEN

 /// Giving this value for the sequence length sets automatic parameter value
@ -81,7 +87,7 @@ namespace soundtouch
 /// around, try reducing this setting.
 ///
 /// Increasing this value increases computational burden & vice versa.
-//#define DEFAULT_SEEKWINDOW_MS       25
+//#define DEFAULT_SEEKWINDOW_MS       15
 #define DEFAULT_SEEKWINDOW_MS       USE_AUTO_SEEKWINDOW_LEN

 /// Giving this value for the seek window length sets automatic parameter value
@ -121,7 +127,8 @@ protected:
    FIFOSampleBuffer outputBuffer;
    FIFOSampleBuffer inputBuffer;
    BOOL bQuickSeek;
-    BOOL bMidBufferDirty;
+//    int outDebt;
+//    BOOL bMidBufferDirty;

    int sampleRate;
    int sequenceMs;
--- a/source/SoundTouch/mmx_optimized.cpp
+++ b/source/SoundTouch/mmx_optimized.cpp
@ -68,6 +68,7 @@ using namespace soundtouch;
 #include "TDStretch.h"
 #include <mmintrin.h>
 #include <limits.h>
+#include <math.h>


 // Calculates cross correlation of two buffers
@ -75,21 +76,21 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
 {
    const __m64 *pVec1, *pVec2;
    __m64 shifter;
-    __m64 accu;
-    long corr;
+    __m64 accu, normaccu;
+    long corr, norm;
    int i;
   
    pVec1 = (__m64*)pV1;
    pVec2 = (__m64*)pV2;

    shifter = _m_from_int(overlapDividerBits);
-    accu = _mm_setzero_si64();
+    normaccu = accu = _mm_setzero_si64();

    // Process 4 parallel sets of 2 * stereo samples each during each 
    // round to improve CPU-level parallellization.
    for (i = 0; i < overlapLength / 8; i ++)
    {
-        __m64 temp;
+        __m64 temp, temp2;

        // dictionary of instructions:
        // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
@ -98,11 +99,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const

        temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
                            _mm_madd_pi16(pVec1[1], pVec2[1]));
+        temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]),
+                             _mm_madd_pi16(pVec1[1], pVec1[1]));
        accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+        normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));

        temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
                            _mm_madd_pi16(pVec1[3], pVec2[3]));
+        temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]),
+                             _mm_madd_pi16(pVec1[3], pVec1[3]));
        accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+        normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));

        pVec1 += 4;
        pVec2 += 4;
@ -114,10 +121,16 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
    accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
    corr = _m_to_int(accu);

+    normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
+    norm = _m_to_int(normaccu);
+
    // Clear MMS state
    _m_empty();

-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (long)((double)corr * USHRT_MAX / sqrt((double)norm));
    // Note: Warning about the missing EMMS instruction is harmless
    // as it'll be called elsewhere.
 }
@ -154,7 +167,9 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
    mix2  = _mm_add_pi16(mix1, adder);
    adder = _mm_add_pi16(adder, adder);

-    shifter = _m_from_int(overlapDividerBits);
+    // Overlap-length division by shifter. "+1" is to account for the "-1" deducted in
+    // the overlapDividerBits calculation earlier.
+    shifter = _m_from_int(overlapDividerBits + 1);

    for (i = 0; i < overlapLength / 4; i ++)
    {
--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@ -68,6 +68,7 @@ using namespace soundtouch;

 #include "TDStretch.h"
 #include <xmmintrin.h>
+#include <math.h>

 // Calculates cross correlation of two buffers
 double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
@ -75,7 +76,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
    int i;
    const float *pVec1;
    const __m128 *pVec2;
-    __m128 vSum;
+    __m128 vSum, vNorm;

    // Note. It means a major slow-down if the routine needs to tolerate 
    // unaligned __m128 memory accesses. It's way faster if we can skip 
@ -107,30 +108,43 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
    // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
    pVec1 = (const float*)pV1;
    pVec2 = (const __m128*)pV2;
-    vSum = _mm_setzero_ps();
+    vSum = vNorm = _mm_setzero_ps();

    // Unroll the loop by factor of 4 * 4 operations
    for (i = 0; i < overlapLength / 8; i ++) 
    {
+        __m128 vTemp;
        // vSum += pV1[0..3] * pV2[0..3]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1),pVec2[0]));
+        vTemp = _MM_LOAD(pVec1);
+        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        // vSum += pV1[4..7] * pV2[4..7]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1]));
+        vTemp = _MM_LOAD(pVec1 + 4);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        // vSum += pV1[8..11] * pV2[8..11]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2]));
+        vTemp = _MM_LOAD(pVec1 + 8);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        // vSum += pV1[12..15] * pV2[12..15]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3]));
+        vTemp = _MM_LOAD(pVec1 + 12);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        pVec1 += 16;
        pVec2 += 4;
    }

    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
+    float *pvNorm = (float*)&vNorm;
+    double norm = sqrt(vNorm.m128_f32[0] + vNorm.m128_f32[1] + vNorm.m128_f32[2] + vNorm.m128_f32[3]);
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+
    float *pvSum = (float*)&vSum;
-    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
+    return (double)(vSum.m128_f32[0] + vSum.m128_f32[1] + vSum.m128_f32[2] + vSum.m128_f32[3]) / norm;

    /* This is approximately corresponding routine in C-language:
    double corr;