mirror of
https://github.com/RPCS3/soundtouch.git
synced 2024-11-08 20:12:27 +01:00
Couple of improvements:
- Added normalization to correlation calculation
- Heuristic that weights center of the processing window
This commit is contained in:
parent
dc4004e0c3
commit
fb966425c4
@ -698,6 +698,7 @@ SoundTouch v1.3.1: </p>
|
||||
<li>Justin Frankel</li>
|
||||
<li>Jason Garland</li>
|
||||
<li>Takashi Iwai</li>
|
||||
<li>John Sheehy</li>
|
||||
</ul>
|
||||
<p >Moral greetings to all other contributors and users also!</p>
|
||||
<hr>
|
||||
|
@ -51,6 +51,8 @@
|
||||
#include "cpu_detect.h"
|
||||
#include "TDStretch.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
using namespace soundtouch;
|
||||
|
||||
#define max(x, y) (((x) > (y)) ? (x) : (y))
|
||||
@ -85,7 +87,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
|
||||
{
|
||||
bQuickSeek = FALSE;
|
||||
channels = 2;
|
||||
bMidBufferDirty = FALSE;
|
||||
|
||||
pMidBuffer = NULL;
|
||||
pRefMidBufferUnaligned = NULL;
|
||||
@ -94,9 +95,14 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
|
||||
bAutoSeqSetting = TRUE;
|
||||
bAutoSeekSetting = TRUE;
|
||||
|
||||
// outDebt = 0;
|
||||
skipFract = 0;
|
||||
|
||||
tempo = 1.0f;
|
||||
setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
|
||||
setTempo(1.0f);
|
||||
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
@ -129,8 +135,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
|
||||
{
|
||||
this->sequenceMs = aSequenceMS;
|
||||
bAutoSeqSetting = FALSE;
|
||||
} else {
|
||||
// zero or below, use automatic setting
|
||||
}
|
||||
else if (aSequenceMS == 0)
|
||||
{
|
||||
// if zero, use automatic setting
|
||||
bAutoSeqSetting = TRUE;
|
||||
}
|
||||
|
||||
@ -138,8 +146,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
|
||||
{
|
||||
this->seekWindowMs = aSeekWindowMS;
|
||||
bAutoSeekSetting = FALSE;
|
||||
} else {
|
||||
// zero or below, use automatic setting
|
||||
}
|
||||
else if (aSeekWindowMS == 0)
|
||||
{
|
||||
// if zero, use automatic setting
|
||||
bAutoSeekSetting = TRUE;
|
||||
}
|
||||
|
||||
@ -197,11 +207,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
|
||||
|
||||
void TDStretch::clearMidBuffer()
|
||||
{
|
||||
if (bMidBufferDirty)
|
||||
{
|
||||
memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
|
||||
bMidBufferDirty = FALSE;
|
||||
}
|
||||
memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
|
||||
}
|
||||
|
||||
|
||||
@ -216,8 +222,7 @@ void TDStretch::clearInput()
|
||||
void TDStretch::clear()
|
||||
{
|
||||
outputBuffer.clear();
|
||||
inputBuffer.clear();
|
||||
clearMidBuffer();
|
||||
clearInput();
|
||||
}
|
||||
|
||||
|
||||
@ -295,7 +300,7 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
|
||||
int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
|
||||
{
|
||||
int bestOffs;
|
||||
LONG_SAMPLETYPE bestCorr, corr;
|
||||
double bestCorr, corr;
|
||||
int i;
|
||||
|
||||
// Slopes the amplitudes of the 'midBuffer' samples
|
||||
@ -310,7 +315,10 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
|
||||
{
|
||||
// Calculates correlation value for the mixing position corresponding
|
||||
// to 'i'
|
||||
corr = calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
|
||||
corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
|
||||
// heuristic rule to slightly favour values close to mid of the range
|
||||
double tmp = (double)(2 * i - seekLength) / (double)seekLength;
|
||||
corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
|
||||
|
||||
// Checks for the highest correlation value
|
||||
if (corr > bestCorr)
|
||||
@ -336,7 +344,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
|
||||
{
|
||||
int j;
|
||||
int bestOffs;
|
||||
LONG_SAMPLETYPE bestCorr, corr;
|
||||
double bestCorr, corr;
|
||||
int scanCount, corrOffset, tempOffset;
|
||||
|
||||
// Slopes the amplitude of the 'midBuffer' samples
|
||||
@ -363,7 +371,10 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
|
||||
|
||||
// Calculates correlation value for the mixing position corresponding
|
||||
// to 'tempOffset'
|
||||
corr = calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
|
||||
corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
|
||||
// heuristic rule to slightly favour values close to mid of the range
|
||||
double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
|
||||
corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
|
||||
|
||||
// Checks for the highest correlation value
|
||||
if (corr > bestCorr)
|
||||
@ -392,7 +403,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
|
||||
int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos)
|
||||
{
|
||||
int bestOffs;
|
||||
LONG_SAMPLETYPE bestCorr, corr;
|
||||
double bestCorr, corr;
|
||||
int tempOffset;
|
||||
const SAMPLETYPE *compare;
|
||||
|
||||
@ -410,7 +421,10 @@ int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos)
|
||||
|
||||
// Calculates correlation value for the mixing position corresponding
|
||||
// to 'tempOffset'
|
||||
corr = calcCrossCorrMono(pRefMidBuffer, compare);
|
||||
corr = (double)calcCrossCorrMono(pRefMidBuffer, compare);
|
||||
// heuristic rule to slightly favour values close to mid of the range
|
||||
double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
|
||||
corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
|
||||
|
||||
// Checks for the highest correlation value
|
||||
if (corr > bestCorr)
|
||||
@ -436,7 +450,7 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
|
||||
{
|
||||
int j;
|
||||
int bestOffs;
|
||||
LONG_SAMPLETYPE bestCorr, corr;
|
||||
double bestCorr, corr;
|
||||
int scanCount, corrOffset, tempOffset;
|
||||
|
||||
// Slopes the amplitude of the 'midBuffer' samples
|
||||
@ -463,7 +477,10 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
|
||||
|
||||
// Calculates correlation value for the mixing position corresponding
|
||||
// to 'tempOffset'
|
||||
corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
|
||||
corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
|
||||
// heuristic rule to slightly favour values close to mid of the range
|
||||
double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
|
||||
corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
|
||||
|
||||
// Checks for the highest correlation value
|
||||
if (corr > bestCorr)
|
||||
@ -529,6 +546,10 @@ void TDStretch::calcSeqParameters()
|
||||
|
||||
// Update seek window lengths
|
||||
seekWindowLength = (sampleRate * sequenceMs) / 1000;
|
||||
if (seekWindowLength < 2 * overlapLength)
|
||||
{
|
||||
seekWindowLength = 2 * overlapLength;
|
||||
}
|
||||
seekLength = (sampleRate * seekWindowMs) / 1000;
|
||||
}
|
||||
|
||||
@ -547,11 +568,11 @@ void TDStretch::setTempo(float newTempo)
|
||||
|
||||
// Calculate ideal skip length (according to tempo value)
|
||||
nominalSkip = tempo * (seekWindowLength - overlapLength);
|
||||
skipFract = 0;
|
||||
intskip = (int)(nominalSkip + 0.5f);
|
||||
|
||||
// Calculate how many samples are needed in the 'inputBuffer' to
|
||||
// process another batch of samples
|
||||
//sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
|
||||
sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
|
||||
}
|
||||
|
||||
@ -602,6 +623,8 @@ void TDStretch::processNominalTempo()
|
||||
}
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
// Processes as many processing frames of the samples 'inputBuffer', store
|
||||
// the result into 'outputBuffer'
|
||||
void TDStretch::processSamples()
|
||||
@ -619,22 +642,9 @@ void TDStretch::processSamples()
|
||||
}
|
||||
*/
|
||||
|
||||
if (bMidBufferDirty == FALSE)
|
||||
{
|
||||
// if midBuffer is empty, move the first samples of the input stream
|
||||
// into it
|
||||
if ((int)inputBuffer.numSamples() < overlapLength)
|
||||
{
|
||||
// wait until we've got overlapLength samples
|
||||
return;
|
||||
}
|
||||
memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE));
|
||||
inputBuffer.receiveSamples((uint)overlapLength);
|
||||
bMidBufferDirty = TRUE;
|
||||
}
|
||||
|
||||
// Process samples as long as there are enough samples in 'inputBuffer'
|
||||
// to form a processing frame.
|
||||
// while ((int)inputBuffer.numSamples() >= sampleReq - (outDebt / 4))
|
||||
while ((int)inputBuffer.numSamples() >= sampleReq)
|
||||
{
|
||||
// If tempo differs from the normal ('SCALE'), scan for the best overlapping
|
||||
@ -648,20 +658,33 @@ void TDStretch::processSamples()
|
||||
overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
|
||||
outputBuffer.putSamples((uint)overlapLength);
|
||||
|
||||
// ... then copy sequence samples from 'inputBuffer' to output
|
||||
temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe;
|
||||
if (temp > 0)
|
||||
// ... then copy sequence samples from 'inputBuffer' to output:
|
||||
temp = (seekLength / 2 - offset);
|
||||
|
||||
// compensate cumulated output length diff vs. ideal output
|
||||
// temp -= outDebt / 4;
|
||||
|
||||
// update ideal vs. true output difference
|
||||
// outDebt += temp;
|
||||
|
||||
// length of sequence
|
||||
// temp += (seekWindowLength - 2 * overlapLength);
|
||||
temp = (seekWindowLength - 2 * overlapLength);
|
||||
|
||||
// crosscheck that we don't have buffer overflow...
|
||||
if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
|
||||
{
|
||||
outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
|
||||
continue; // just in case, shouldn't really happen
|
||||
}
|
||||
|
||||
outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
|
||||
|
||||
// Copies the end of the current sequence from 'inputBuffer' to
|
||||
// 'midBuffer' for being mixed with the beginning of the next
|
||||
// processing sequence and so on
|
||||
assert(offset + seekWindowLength <= (int)inputBuffer.numSamples());
|
||||
memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength),
|
||||
assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
|
||||
memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength),
|
||||
channels * sizeof(SAMPLETYPE) * overlapLength);
|
||||
bMidBufferDirty = TRUE;
|
||||
|
||||
// Remove the processed samples from the input buffer. Update
|
||||
// the difference between integer & nominal skip step to 'skipFract'
|
||||
@ -701,7 +724,6 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength)
|
||||
delete[] pRefMidBufferUnaligned;
|
||||
|
||||
pMidBuffer = new SAMPLETYPE[overlapLength * 2];
|
||||
bMidBufferDirty = TRUE;
|
||||
clearMidBuffer();
|
||||
|
||||
pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
|
||||
@ -842,10 +864,14 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
|
||||
int newOvl;
|
||||
|
||||
assert(aoverlapMs >= 0);
|
||||
overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0);
|
||||
|
||||
// calculate overlap length so that it's power of 2 - thus it's easy to do
|
||||
// integer division by right-shifting. Term "-1" at end is to account for
|
||||
// the extra most significant bit left unused in result by signed multiplication
|
||||
overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
|
||||
if (overlapDividerBits > 9) overlapDividerBits = 9;
|
||||
if (overlapDividerBits < 4) overlapDividerBits = 4;
|
||||
newOvl = (int)pow(2.0, (int)overlapDividerBits);
|
||||
if (overlapDividerBits < 3) overlapDividerBits = 3;
|
||||
newOvl = (int)pow(2.0, (int)overlapDividerBits + 1); // +1 => account for -1 above
|
||||
|
||||
acceptNewOverlapLength(newOvl);
|
||||
|
||||
@ -859,31 +885,41 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
|
||||
long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
|
||||
{
|
||||
long corr;
|
||||
long norm;
|
||||
int i;
|
||||
|
||||
corr = 0;
|
||||
corr = norm = 0;
|
||||
for (i = 1; i < overlapLength; i ++)
|
||||
{
|
||||
corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
|
||||
norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
|
||||
}
|
||||
|
||||
return corr;
|
||||
// Normalize result by dividing by sqrt(norm) - this step is easiest
|
||||
// done using floating point operation
|
||||
if (norm == 0) norm = 1; // to avoid div by zero
|
||||
return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
|
||||
}
|
||||
|
||||
|
||||
long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
|
||||
{
|
||||
long corr;
|
||||
long norm;
|
||||
int i;
|
||||
|
||||
corr = 0;
|
||||
corr = norm = 0;
|
||||
for (i = 2; i < 2 * overlapLength; i += 2)
|
||||
{
|
||||
corr += (mixingPos[i] * compare[i] +
|
||||
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;
|
||||
norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits;
|
||||
}
|
||||
|
||||
return corr;
|
||||
// Normalize result by dividing by sqrt(norm) - this step is easiest
|
||||
// done using floating point operation
|
||||
if (norm == 0) norm = 1; // to avoid div by zero
|
||||
return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
|
||||
}
|
||||
|
||||
#endif // INTEGER_SAMPLES
|
||||
@ -970,31 +1006,38 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
|
||||
double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
|
||||
{
|
||||
double corr;
|
||||
double norm;
|
||||
int i;
|
||||
|
||||
corr = 0;
|
||||
corr = norm = 0;
|
||||
for (i = 1; i < overlapLength; i ++)
|
||||
{
|
||||
corr += mixingPos[i] * compare[i];
|
||||
norm += mixingPos[i] * mixingPos[i];
|
||||
}
|
||||
|
||||
return corr;
|
||||
if (norm < 1e-9) norm = 1.0; // to avoid div by zero
|
||||
return corr / sqrt(norm);
|
||||
}
|
||||
|
||||
|
||||
double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
|
||||
{
|
||||
double corr;
|
||||
double norm;
|
||||
int i;
|
||||
|
||||
corr = 0;
|
||||
corr = norm = 0;
|
||||
for (i = 2; i < 2 * overlapLength; i += 2)
|
||||
{
|
||||
corr += mixingPos[i] * compare[i] +
|
||||
mixingPos[i + 1] * compare[i + 1];
|
||||
norm += mixingPos[i] * mixingPos[i] +
|
||||
mixingPos[i + 1] * mixingPos[i + 1];
|
||||
}
|
||||
|
||||
return corr;
|
||||
if (norm < 1e-9) norm = 1.0; // to avoid div by zero
|
||||
return corr / sqrt(norm);
|
||||
}
|
||||
|
||||
#endif // FLOAT_SAMPLES
|
||||
|
@ -4,8 +4,8 @@
|
||||
/// while maintaining the original pitch by using a time domain WSOLA-like method
|
||||
/// with several performance-increasing tweaks.
|
||||
///
|
||||
/// Note : MMX optimized functions reside in a separate, platform-specific file,
|
||||
/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
|
||||
/// Note : MMX/SSE optimized functions reside in separate, platform-specific files
|
||||
/// 'mmx_optimized.cpp' and 'sse_optimized.cpp'
|
||||
///
|
||||
/// Author : Copyright (c) Olli Parviainen
|
||||
/// Author e-mail : oparviai 'at' iki.fi
|
||||
@ -52,7 +52,13 @@
|
||||
namespace soundtouch
|
||||
{
|
||||
|
||||
// Default values for sound processing parameters:
|
||||
/// Default values for sound processing parameters:
|
||||
/// Notice that the default parameters are tuned for contemporary popular music
|
||||
/// processing. For speech processing applications these parameters suit better:
|
||||
/// #define DEFAULT_SEQUENCE_MS 40
|
||||
/// #define DEFAULT_SEEKWINDOW_MS 15
|
||||
/// #define DEFAULT_OVERLAP_MS 8
|
||||
///
|
||||
|
||||
/// Default length of a single processing sequence, in milliseconds. This determines to how
|
||||
/// long sequences the original sound is chopped in the time-stretch algorithm.
|
||||
@ -62,7 +68,7 @@ namespace soundtouch
|
||||
/// and vice versa.
|
||||
///
|
||||
/// Increasing this value reduces computational burden & vice versa.
|
||||
//#define DEFAULT_SEQUENCE_MS 130
|
||||
//#define DEFAULT_SEQUENCE_MS 40
|
||||
#define DEFAULT_SEQUENCE_MS USE_AUTO_SEQUENCE_LEN
|
||||
|
||||
/// Giving this value for the sequence length sets automatic parameter value
|
||||
@ -81,7 +87,7 @@ namespace soundtouch
|
||||
/// around, try reducing this setting.
|
||||
///
|
||||
/// Increasing this value increases computational burden & vice versa.
|
||||
//#define DEFAULT_SEEKWINDOW_MS 25
|
||||
//#define DEFAULT_SEEKWINDOW_MS 15
|
||||
#define DEFAULT_SEEKWINDOW_MS USE_AUTO_SEEKWINDOW_LEN
|
||||
|
||||
/// Giving this value for the seek window length sets automatic parameter value
|
||||
@ -121,7 +127,8 @@ protected:
|
||||
FIFOSampleBuffer outputBuffer;
|
||||
FIFOSampleBuffer inputBuffer;
|
||||
BOOL bQuickSeek;
|
||||
BOOL bMidBufferDirty;
|
||||
// int outDebt;
|
||||
// BOOL bMidBufferDirty;
|
||||
|
||||
int sampleRate;
|
||||
int sequenceMs;
|
||||
|
@ -68,6 +68,7 @@ using namespace soundtouch;
|
||||
#include "TDStretch.h"
|
||||
#include <mmintrin.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
// Calculates cross correlation of two buffers
|
||||
@ -75,21 +76,21 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
|
||||
{
|
||||
const __m64 *pVec1, *pVec2;
|
||||
__m64 shifter;
|
||||
__m64 accu;
|
||||
long corr;
|
||||
__m64 accu, normaccu;
|
||||
long corr, norm;
|
||||
int i;
|
||||
|
||||
pVec1 = (__m64*)pV1;
|
||||
pVec2 = (__m64*)pV2;
|
||||
|
||||
shifter = _m_from_int(overlapDividerBits);
|
||||
accu = _mm_setzero_si64();
|
||||
normaccu = accu = _mm_setzero_si64();
|
||||
|
||||
// Process 4 parallel sets of 2 * stereo samples each during each
|
||||
// round to improve CPU-level parallelization.
|
||||
for (i = 0; i < overlapLength / 8; i ++)
|
||||
{
|
||||
__m64 temp;
|
||||
__m64 temp, temp2;
|
||||
|
||||
// dictionary of instructions:
|
||||
// _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
|
||||
@ -98,11 +99,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
|
||||
|
||||
temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
|
||||
_mm_madd_pi16(pVec1[1], pVec2[1]));
|
||||
temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]),
|
||||
_mm_madd_pi16(pVec1[1], pVec1[1]));
|
||||
accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
|
||||
normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
|
||||
|
||||
temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
|
||||
_mm_madd_pi16(pVec1[3], pVec2[3]));
|
||||
temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]),
|
||||
_mm_madd_pi16(pVec1[3], pVec1[3]));
|
||||
accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
|
||||
normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
|
||||
|
||||
pVec1 += 4;
|
||||
pVec2 += 4;
|
||||
@ -114,10 +121,16 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
|
||||
accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
|
||||
corr = _m_to_int(accu);
|
||||
|
||||
normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
|
||||
norm = _m_to_int(normaccu);
|
||||
|
||||
// Clear MMX state
|
||||
_m_empty();
|
||||
|
||||
return corr;
|
||||
// Normalize result by dividing by sqrt(norm) - this step is easiest
|
||||
// done using floating point operation
|
||||
if (norm == 0) norm = 1; // to avoid div by zero
|
||||
return (long)((double)corr * USHRT_MAX / sqrt((double)norm));
|
||||
// Note: Warning about the missing EMMS instruction is harmless
|
||||
// as it'll be called elsewhere.
|
||||
}
|
||||
@ -154,7 +167,9 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
|
||||
mix2 = _mm_add_pi16(mix1, adder);
|
||||
adder = _mm_add_pi16(adder, adder);
|
||||
|
||||
shifter = _m_from_int(overlapDividerBits);
|
||||
// Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
|
||||
// overlapDividerBits calculation earlier.
|
||||
shifter = _m_from_int(overlapDividerBits + 1);
|
||||
|
||||
for (i = 0; i < overlapLength / 4; i ++)
|
||||
{
|
||||
|
@ -68,6 +68,7 @@ using namespace soundtouch;
|
||||
|
||||
#include "TDStretch.h"
|
||||
#include <xmmintrin.h>
|
||||
#include <math.h>
|
||||
|
||||
// Calculates cross correlation of two buffers
|
||||
double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
|
||||
@ -75,7 +76,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
|
||||
int i;
|
||||
const float *pVec1;
|
||||
const __m128 *pVec2;
|
||||
__m128 vSum;
|
||||
__m128 vSum, vNorm;
|
||||
|
||||
// Note. It means a major slow-down if the routine needs to tolerate
|
||||
// unaligned __m128 memory accesses. It's way faster if we can skip
|
||||
@ -107,30 +108,43 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
|
||||
// Note: pV2 _must_ be aligned to 16-byte boundary, pV1 need not.
|
||||
pVec1 = (const float*)pV1;
|
||||
pVec2 = (const __m128*)pV2;
|
||||
vSum = _mm_setzero_ps();
|
||||
vSum = vNorm = _mm_setzero_ps();
|
||||
|
||||
// Unroll the loop by factor of 4 * 4 operations
|
||||
for (i = 0; i < overlapLength / 8; i ++)
|
||||
{
|
||||
__m128 vTemp;
|
||||
// vSum += pV1[0..3] * pV2[0..3]
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1),pVec2[0]));
|
||||
vTemp = _MM_LOAD(pVec1);
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp ,pVec2[0]));
|
||||
vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
|
||||
|
||||
// vSum += pV1[4..7] * pV2[4..7]
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1]));
|
||||
vTemp = _MM_LOAD(pVec1 + 4);
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
|
||||
vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
|
||||
|
||||
// vSum += pV1[8..11] * pV2[8..11]
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2]));
|
||||
vTemp = _MM_LOAD(pVec1 + 8);
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
|
||||
vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
|
||||
|
||||
// vSum += pV1[12..15] * pV2[12..15]
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3]));
|
||||
vTemp = _MM_LOAD(pVec1 + 12);
|
||||
vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
|
||||
vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
|
||||
|
||||
pVec1 += 16;
|
||||
pVec2 += 4;
|
||||
}
|
||||
|
||||
// return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
|
||||
float *pvNorm = (float*)&vNorm;
|
||||
double norm = sqrt(vNorm.m128_f32[0] + vNorm.m128_f32[1] + vNorm.m128_f32[2] + vNorm.m128_f32[3]);
|
||||
if (norm < 1e-9) norm = 1.0; // to avoid div by zero
|
||||
|
||||
float *pvSum = (float*)&vSum;
|
||||
return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
|
||||
return (double)(vSum.m128_f32[0] + vSum.m128_f32[1] + vSum.m128_f32[2] + vSum.m128_f32[3]) / norm;
|
||||
|
||||
/* This is approximately corresponding routine in C-language:
|
||||
double corr;
|
||||
|
Loading…
Reference in New Issue
Block a user