diff --git a/README.html b/README.html index 8cdb998..281145e 100644 --- a/README.html +++ b/README.html @@ -747,6 +747,7 @@ submitted bugfixes since SoundTouch v1.3.1:

  • Sandro Cumerlato
  • Justin Frankel
  • Jason Garland
  • +
  • Masa H.
  • Takashi Iwai
  • Mathias Möhl
  • Yuval Naveh
  • diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp index 3f215dc..8ea11d7 100644 --- a/source/SoundTouch/TDStretch.cpp +++ b/source/SoundTouch/TDStretch.cpp @@ -742,12 +742,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) co for (i = 0; i < channels * overlapLength; i += 4) { corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1] + - mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow + corr += (mixingPos[i + 2] * compare[i + 2] + mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; norm += (mixingPos[i] * mixingPos[i] + - mixingPos[i + 1] * mixingPos[i + 1] + - mixingPos[i + 2] * mixingPos[i + 2] + + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow + norm += (mixingPos[i + 2] * mixingPos[i + 2] + mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits; } diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp index a201b14..03d1805 100644 --- a/source/SoundTouch/mmx_optimized.cpp +++ b/source/SoundTouch/mmx_optimized.cpp @@ -93,19 +93,19 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const // _mm_add_pi32 : 2*32bit add // _m_psrad : 32bit right-shift - temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), - _mm_madd_pi16(pVec1[1], pVec2[1])); - temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), - _mm_madd_pi16(pVec1[1], pVec1[1])); - accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter)); - normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter)); + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter)); + temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter)); + accu = _mm_add_pi32(accu, temp); + normaccu = _mm_add_pi32(normaccu, temp2); - temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), - _mm_madd_pi16(pVec1[3], pVec2[3])); - temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), - _mm_madd_pi16(pVec1[3], pVec1[3])); - accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter)); - normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter)); + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter)); + temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter), + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter)); + accu = _mm_add_pi32(accu, temp); + normaccu = _mm_add_pi32(normaccu, temp2); pVec1 += 4; pVec2 += 4;