diff --git a/README.html b/README.html
index 8cdb998..281145e 100644
--- a/README.html
+++ b/README.html
@@ -747,6 +747,7 @@ submitted bugfixes since SoundTouch v1.3.1:
Sandro Cumerlato
Justin Frankel
Jason Garland
+ Masa H.
Takashi Iwai
Mathias Möhl
Yuval Naveh
diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp
index 3f215dc..8ea11d7 100644
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@@ -742,12 +742,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) co
for (i = 0; i < channels * overlapLength; i += 4)
{
corr += (mixingPos[i] * compare[i] +
- mixingPos[i + 1] * compare[i + 1] +
- mixingPos[i + 2] * compare[i + 2] +
+ mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
+ corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
norm += (mixingPos[i] * mixingPos[i] +
- mixingPos[i + 1] * mixingPos[i + 1] +
- mixingPos[i + 2] * mixingPos[i + 2] +
+ mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
+ norm += (mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
}
diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp
index a201b14..03d1805 100644
--- a/source/SoundTouch/mmx_optimized.cpp
+++ b/source/SoundTouch/mmx_optimized.cpp
@@ -93,19 +93,19 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const
// _mm_add_pi32 : 2*32bit add
// _m_psrad : 32bit right-shift
- temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
- _mm_madd_pi16(pVec1[1], pVec2[1]));
- temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]),
- _mm_madd_pi16(pVec1[1], pVec1[1]));
- accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
- normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
+ temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
+ _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
+ temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter),
+ _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter));
+ accu = _mm_add_pi32(accu, temp);
+ normaccu = _mm_add_pi32(normaccu, temp2);
- temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
- _mm_madd_pi16(pVec1[3], pVec2[3]));
- temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]),
- _mm_madd_pi16(pVec1[3], pVec1[3]));
- accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
- normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
+ temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
+ _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
+ temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter),
+ _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter));
+ accu = _mm_add_pi32(accu, temp);
+ normaccu = _mm_add_pi32(normaccu, temp2);
pVec1 += 4;
pVec2 += 4;