diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp index 8ea11d7..b1fa670 100644 --- a/source/SoundTouch/TDStretch.cpp +++ b/source/SoundTouch/TDStretch.cpp @@ -293,6 +293,7 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) { int bestOffs; double bestCorr, corr; + double norm; int i; bestCorr = FLT_MIN; @@ -300,11 +301,15 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) // Scans for the best correlation value by testing each possible position // over the permitted range. - for (i = 0; i < seekLength; i ++) + bestCorr = calcCrossCorr(refPos, pMidBuffer, norm); + for (i = 1; i < seekLength; i ++) { // Calculates correlation value for the mixing position corresponding - // to 'i' - corr = calcCrossCorr(refPos + channels * i, pMidBuffer); + // to 'i'. Now call "calcCrossCorrAccumulate" that is otherwise same as + // "calcCrossCorr", but saves time by reusing & updating previously stored + // "norm" value + corr = calcCrossCorrAccumulate(refPos + channels * i, pMidBuffer, norm); + // heuristic rule to slightly favour values close to mid of the range double tmp = (double)(2 * i - seekLength) / (double)seekLength; corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); @@ -352,12 +357,13 @@ int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) j = 0; while (_scanOffsets[scanCount][j]) { + double norm; tempOffset = corrOffset + _scanOffsets[scanCount][j]; if (tempOffset >= seekLength) break; // Calculates correlation value for the mixing position corresponding // to 'tempOffset' - corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer); + corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer, norm); // heuristic rule to slightly favour values close to mid of the range double tmp = (double)(2 * tempOffset - seekLength) / seekLength; corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); @@ -729,13 +735,13 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) } -double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) const +double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const { long corr; - long norm; + long lnorm; int i; - corr = norm = 0; + corr = lnorm = 0; // Same routine for stereo and mono. For stereo, unroll loop for better // efficiency and gives slightly better resolution against rounding. // For mono it same routine, just unrolls loop by factor of 4 @@ -745,16 +751,56 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) co mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow corr += (mixingPos[i + 2] * compare[i + 2] + mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; - norm += (mixingPos[i] * mixingPos[i] + - mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow - norm += (mixingPos[i + 2] * mixingPos[i + 2] + - mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits; + lnorm += (mixingPos[i] * mixingPos[i] + + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow + lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + + mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits; } // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation - if (norm == 0) norm = 1; // to avoid div by zero - return (double)corr / sqrt((double)norm); + norm = (double)lnorm; + return (double)corr / sqrt((norm < 1e-9) ? 1.0 : norm); +} + + +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value +double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const +{ + long corr; + long lnorm; + int i; + + // cancel first normalizer tap from previous round + lnorm = 0; + for (i = 1; i <= channels; i ++) + { + lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBits; + } + + corr = 0; + // Same routine for stereo and mono. For stereo, unroll loop for better + // efficiency and gives slightly better resolution against rounding. + // For mono it same routine, just unrolls loop by factor of 4 + for (i = 0; i < channels * overlapLength; i += 4) + { + corr += (mixingPos[i] * compare[i] + + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow + corr += (mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; + } + + // update normalizer with last samples of this round + for (int j = 0; j < channels; j ++) + { + i --; + lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits; + } + norm += (double)lnorm; + + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + return (double)corr / sqrt((norm < 1e-9) ? 1.0 : norm); } #endif // SOUNDTOUCH_INTEGER_SAMPLES @@ -834,10 +880,10 @@ void TDStretch::calculateOverlapLength(int overlapInMsec) } -double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare) const +/// Calculate cross-correlation +double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &norm) const { double corr; - double norm; int i; corr = norm = 0; @@ -859,8 +905,43 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare) co mixingPos[i + 3] * mixingPos[i + 3]; } - if (norm < 1e-9) norm = 1.0; // to avoid div by zero - return corr / sqrt(norm); + return corr / sqrt((norm < 1e-9 ? 1.0 : norm)); } + +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value +double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const +{ + double corr; + int i; + + corr = 0; + + // cancel first normalizer tap from previous round + for (i = 1; i <= channels; i ++) + { + norm -= mixingPos[-i] * mixingPos[-i]; + } + + // Same routine for stereo and mono. For Stereo, unroll by factor of 2. + // For mono it's same routine yet unrollsd by factor of 4. + for (i = 0; i < channels * overlapLength; i += 4) + { + corr += mixingPos[i] * compare[i] + + mixingPos[i + 1] * compare[i + 1] + + mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 3] * compare[i + 3]; + } + + // update normalizer with last samples of this round + for (int j = 0; j < channels; j ++) + { + i --; + norm += mixingPos[i] * mixingPos[i]; + } + + return corr / sqrt((norm < 1e-9 ? 1.0 : norm)); +} + + #endif // SOUNDTOUCH_FLOAT_SAMPLES diff --git a/source/SoundTouch/TDStretch.h b/source/SoundTouch/TDStretch.h index c0f2853..4ce8726 100644 --- a/source/SoundTouch/TDStretch.h +++ b/source/SoundTouch/TDStretch.h @@ -139,7 +139,8 @@ protected: virtual void clearCrossCorrState(); void calculateOverlapLength(int overlapMs); - virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; + virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm) const; + virtual double calcCrossCorrAccumulate(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm) const; virtual int seekBestOverlapPositionFull(const SAMPLETYPE *refPos); virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos); @@ -248,7 +249,8 @@ public: class TDStretchMMX : public TDStretch { protected: - double calcCrossCorr(const short *mixingPos, const short *compare) const; + double calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const; + double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const; virtual void overlapStereo(short *output, const short *input) const; virtual void clearCrossCorrState(); }; @@ -260,7 +262,8 @@ public: class TDStretchSSE : public TDStretch { protected: - double calcCrossCorr(const float *mixingPos, const float *compare) const; + double calcCrossCorr(const float *mixingPos, const float *compare, double &norm) const; + double calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const; }; #endif /// SOUNDTOUCH_ALLOW_SSE diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp index 03d1805..2d95bfa 100644 --- a/source/SoundTouch/mmx_optimized.cpp +++ b/source/SoundTouch/mmx_optimized.cpp @@ -68,7 +68,7 @@ using namespace soundtouch; // Calculates cross correlation of two buffers -double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const +double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm) const { const __m64 *pVec1, *pVec2; __m64 shifter; @@ -125,14 +125,81 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation - if (norm == 0) norm = 1; // to avoid div by zero + dnorm = (double)norm; - return (double)corr / sqrt((double)norm); + return (double)corr / sqrt(dnorm < 1e-9 ? 1.0 : dnorm); // Note: Warning about the missing EMMS instruction is harmless // as it'll be called elsewhere. } +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value +double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm) const +{ + const __m64 *pVec1, *pVec2; + __m64 shifter; + __m64 accu; + long corr, lnorm; + int i; + + // cancel first normalizer tap from previous round + lnorm = 0; + for (i = 1; i <= channels; i ++) + { + lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBits; + } + + pVec1 = (__m64*)pV1; + pVec2 = (__m64*)pV2; + + shifter = _m_from_int(overlapDividerBits); + accu = _mm_setzero_si64(); + + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples + // during each round for improved CPU-level parallellization. + for (i = 0; i < channels * overlapLength / 16; i ++) + { + __m64 temp; + + // dictionary of instructions: + // _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] + // _mm_add_pi32 : 2*32bit add + // _m_psrad : 32bit right-shift + + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter)); + accu = _mm_add_pi32(accu, temp); + + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter)); + accu = _mm_add_pi32(accu, temp); + + pVec1 += 4; + pVec2 += 4; + } + + // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 + // and finally store the result into the variable "corr" + + accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32)); + corr = _m_to_int(accu); + + // Clear MMS state + _m_empty(); + + // update normalizer with last samples of this round + pV1 = (short *)pVec1; + for (int j = 1; j <= channels; j ++) + { + lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBits; + } + dnorm += (double)lnorm; + + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm); +} + void TDStretchMMX::clearCrossCorrState() { diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp index 7b4e45d..672e58f 100644 --- a/source/SoundTouch/sse_optimized.cpp +++ b/source/SoundTouch/sse_optimized.cpp @@ -71,7 +71,7 @@ using namespace soundtouch; #include // Calculates cross correlation of two buffers -double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const +double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &norm) const { int i; const float *pVec1; @@ -141,11 +141,10 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3] float *pvNorm = (float*)&vNorm; - double norm = sqrt(pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]); - if (norm < 1e-9) norm = 1.0; // to avoid div by zero + norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]); float *pvSum = (float*)&vSum; - return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / norm; + return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm); /* This is approximately corresponding routine in C-language yet without normalization: double corr, norm; @@ -182,6 +181,16 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const } + +double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const +{ + // call usual calcCrossCorr function because SSE does not show big benefit of + // accumulating "norm" value, and also the "norm" rolling algorithm would get + // complicated due to SSE-specific alignment-vs-nonexact correlation rules. + return calcCrossCorr(pV1, pV2, norm); +} + + ////////////////////////////////////////////////////////////////////////////// // // implementation of SSE optimized functions of class 'FIRFilter'