From db04025351472a4349dc76dc9a3e9f5b6583e7d2 Mon Sep 17 00:00:00 2001
From: oparviai <oparviai@f3a24b6a-cf45-0410-b55a-8c22e2698227>
Date: Sat, 8 Aug 2015 21:00:15 +0000
Subject: [PATCH] - Redesigned quickseek algorithm for improved sound quality
 in quickseek mode - Adaptive integer divider scaling for improved sound
 quality when using integer processing - Version 1.9.1-pre

---
 README.html                         |  76 +++++----
 include/SoundTouch.h                |  60 +++----
 source/SoundTouch/TDStretch.cpp     | 238 ++++++++++++++++++++--------
 source/SoundTouch/TDStretch.h       |  43 +++--
 source/SoundTouch/mmx_optimized.cpp |  24 ++-
 source/SoundTouch/sse_optimized.cpp |   4 +-
 6 files changed, 293 insertions(+), 152 deletions(-)
diff --git a/README.html b/README.html
index ef9aa0e..ff5ad24 100644
--- a/README.html
+++ b/README.html
@@ -9,11 +9,11 @@
   <meta name="description"
  content="Readme file for SoundTouch audio processing library">
   <style> <!-- .normal { font-family: Arial }
-	--></style>
+    --></style>
 </head>
 <body class="normal">
 <hr>
-<h1>SoundTouch audio processing library v1.9</h1>
+<h1>SoundTouch audio processing library v1.9.1-pre</h1>
 <p class="normal">SoundTouch library Copyright © Olli Parviainen 2001-2015</p>
 <hr>
 <h2>1. Introduction </h2>
@@ -60,10 +60,10 @@ the compilation, the target program will require additional vcomp dll library to
 properly run. In Visual C++ 9.0 these libraries can be found in the following 
 folders.</p>
 <ul>
-	<li>x86 32bit: C:\Program Files (x86)\Microsoft Visual Studio 
-	9.0\VC\redist\x86\Microsoft.VC90.OPENMP\vcomp90.dll</li>
-	<li>x64 64bit: C:\Program Files (x86)\Microsoft Visual Studio 
-	9.0\VC\redist\amd64\Microsoft.VC90.OPENMP\vcomp90.dll</li>
+    <li>x86 32bit: C:\Program Files (x86)\Microsoft Visual Studio 
+    9.0\VC\redist\x86\Microsoft.VC90.OPENMP\vcomp90.dll</li>
+    <li>x64 64bit: C:\Program Files (x86)\Microsoft Visual Studio 
+    9.0\VC\redist\amd64\Microsoft.VC90.OPENMP\vcomp90.dll</li>
 </ul>
 <p>In Visual Studio 2008, a SP1 version may be required for these libraries. In 
 other VC++ versions the required library will be expectedly found in similar 
@@ -103,8 +103,8 @@ Notice that "configure" file is not available before running the
       </td>
       <td>
       <p>Builds the SoundTouch library &amp; SoundStretch utility. You can 
-	  optionally add &quot;-j&quot; switch after &quot;make&quot; to speed up the compilation in 
-	  multi-core systems.</p>
+      optionally add &quot;-j&quot; switch after &quot;make&quot; to speed up the compilation in 
+      multi-core systems.</p>
       </td>
     </tr>
     <tr valign="top">
@@ -355,8 +355,8 @@ computation burden</td>
 <h3>3.5 Performance Optimizations </h3>
 <p><strong>General optimizations:</strong></p>
 <p>The time-stretch routine has a 'quick' mode that substantially
-speeds up the algorithm but may degrade the sound quality by a small
-amount. This mode is activated by calling SoundTouch::setSetting()
+speeds up the algorithm but may slightly compromise the sound quality. 
+This mode is activated by calling SoundTouch::setSetting()
 function with parameter&nbsp; id of SETTING_USE_QUICKSEEK and value
 "1", i.e. </p>
 <blockquote>
@@ -368,7 +368,7 @@ intrinsics, providing about a 3x processing speedup for x86 compatible
 processors vs. non-SIMD implementation:</p>
 <ul>
     <li> Intel MMX optimized routines are used with x86 CPUs when 16bit integer 
-	sample type is used</li>
+    sample type is used</li>
   <li> Intel SSE optimized routines are used with x86 CPUs when 32bit floating 
   point sample type is used</li>
 </ul>
@@ -395,17 +395,17 @@ This include for example multi-core embedded devices.</p>
 <p>OpenMP parallel computation can be enabled before compiling SoundTouch 
 library as follows:</p>
 <ul>
-	<li><strong>Visual Studio</strong>: Open properties for the <strong>SoundTouch
-	</strong>sub-project, browse to <strong>C/C++</strong> and <strong>Language 
-	</strong>settings. Set 
-	there &quot;<strong>OpenMP support</strong>&quot; to &quot;<strong>Yes</strong>&quot;. Alternatively add 
-	<strong>/openmp</strong> switch to command-line 
-	parameters</li>
-	<li><strong>GNU</strong>: Run the configure script with &quot;<strong>./configure 
-	--enable-openmp</strong>&quot; switch, then run make as usually</li>
-	<li><strong>Android</strong>: Add &quot;<strong>-fopenmp</strong>&quot; switches to compiler &amp; linker 
-	options, see README-SoundTouch-Android.html in the source code package for 
-	more detailed instructions.</li>
+    <li><strong>Visual Studio</strong>: Open properties for the <strong>SoundTouch
+    </strong>sub-project, browse to <strong>C/C++</strong> and <strong>Language 
+    </strong>settings. Set 
+    there &quot;<strong>OpenMP support</strong>&quot; to &quot;<strong>Yes</strong>&quot;. Alternatively add 
+    <strong>/openmp</strong> switch to command-line 
+    parameters</li>
+    <li><strong>GNU</strong>: Run the configure script with &quot;<strong>./configure 
+    --enable-openmp</strong>&quot; switch, then run make as usual</li>
+    <li><strong>Android</strong>: Add &quot;<strong>-fopenmp</strong>&quot; switches to compiler &amp; linker 
+    options, see README-SoundTouch-Android.html in the source code package for 
+    more detailed instructions.</li>
 </ul>
 <hr>
 <h2><a name="SoundStretch"></a>4. SoundStretch audio processing utility
@@ -566,18 +566,25 @@ this corresponds to lowering the pitch by -0.318 semitones:</p>
 <hr>
 <h2>5. Change History</h2>
 <h3>5.1. SoundTouch library Change History </h3>
+    <p><b>1.9.1-pre:</b></p>
+    <ul>
+        <li>Improved SoundTouch::flush() function so that it returns precisely the desired number of samples for exact output duration control</li>
+        <li>Redesigned quickseek algorithm for improved sound quality when using the quickseek mode. The new quickseek algorithm finds results that are on average 99% as good as those of the default full-scan mode.</li>
+        <li>Added adaptive integer divider scaling for improved sound quality when using the integer processing algorithm
+        </li>
+    </ul>
 <p><b>1.9:</b></p>
 <ul>
     <li>Added support for parallel computation support via OpenMP primitives for better performance in multicore systems. 
         Benchmarks show that achieved parallel processing speedup improvement 
-	typically range from +30% (x86 dual-core) to +180% (ARM quad-core). The 
-	OpenMP optimizations are disabled by default, see OpenMP notes above in this 
-	readme file how to enabled these optimizations.</li>
+    typically range from +30% (x86 dual-core) to +180% (ARM quad-core). The 
+    OpenMP optimizations are disabled by default, see OpenMP notes above in this 
+    readme file how to enable these optimizations.</li>
     <li>Android: Added support for Android devices featuring X86 and MIPS CPUs, 
-	in addition to ARM CPUs.</li>
-	<li>Android: More versatile Android example application that processes WAV 
-	audio files with SoundTouch library</li>
-	<li>Replaced Windows-like 'BOOL' types with native 'bool'</li>
+    in addition to ARM CPUs.</li>
+    <li>Android: More versatile Android example application that processes WAV 
+    audio files with SoundTouch library</li>
+    <li>Replaced Windows-like 'BOOL' types with native 'bool'</li>
     <li>Changed documentation token to "dist_doc_DATA" in Makefile.am file</li>
     <li>Miscellaneous small fixes and improvements</li>
 </ul>
@@ -816,7 +823,7 @@ submitted bugfixes:</p>
   <li> David Clark</li>
   <li> Patrick Colis</li>
   <li> Miquel Colon</li>
-	<li> Jim Credland</li>
+    <li> Jim Credland</li>
   <li> Sandro Cumerlato</li>
   <li> Justin Frankel</li>
     <li> Masa H.</li>
@@ -827,10 +834,10 @@ submitted bugfixes:</p>
   <li> Yuval Naveh</li>
   <li> Paulo Pizarro</li>
   <li> Blaise Potard</li>
-	<li> Michael Pruett</li>
+    <li> Michael Pruett</li>
   <li> Rajeev Puran</li>
-	<li> RJ Ryan</li>
-	<li> John Sheehy</li>
+    <li> RJ Ryan</li>
+    <li> John Sheehy</li>
   <li> Tim Shuttleworth</li>
   <li> Albert Sirvent</li>
   <li> John Stumpo</li>
@@ -852,7 +859,8 @@ General Public License for more details.</p>
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA</p>
 <hr><!--
-$Id$-->
+$Id$
+-->
     <p>
         <i>README.html file updated in May-2015</i></p>
 </body>
diff --git a/include/SoundTouch.h b/include/SoundTouch.h
index aad41d6..d87abee 100644
--- a/include/SoundTouch.h
+++ b/include/SoundTouch.h
@@ -79,10 +79,10 @@ namespace soundtouch
 {
 
 /// Soundtouch library version string
-#define SOUNDTOUCH_VERSION          "1.9.0"
+#define SOUNDTOUCH_VERSION          "1.9.1-pre"
 
 /// SoundTouch library version id
-#define SOUNDTOUCH_VERSION_ID       (10900)
+#define SOUNDTOUCH_VERSION_ID       (10901)
 
 //
 // Available setting IDs for the 'setSetting' & 'get_setting' functions:
@@ -154,20 +154,20 @@ private:
     double virtualRate;
 
     /// Virtual pitch parameter. Effective rate & tempo are calculated from these parameters.
-	double virtualTempo;
+    double virtualTempo;
 
     /// Virtual pitch parameter. Effective rate & tempo are calculated from these parameters.
-	double virtualPitch;
+    double virtualPitch;
 
     /// Flag: Has sample rate been set?
     bool  bSrateSet;
 
-	/// Accumulator for how many samples in total will be expected as output vs. samples put in,
-	/// considering current processing settings.
-	double samplesExpectedOut;
+    /// Accumulator for how many samples in total will be expected as output vs. samples put in,
+    /// considering current processing settings.
+    double samplesExpectedOut;
 
-	/// Accumulator for how many samples in total have been read out from the processing so far
-	long   samplesOutput;
+    /// Accumulator for how many samples in total have been read out from the processing so far
+    long   samplesOutput;
 
     /// Calculates effective rate & tempo valuescfrom 'virtualRate', 'virtualTempo' and 
     /// 'virtualPitch' parameters.
@@ -199,28 +199,28 @@ public:
 
     /// Sets new tempo control value. Normal tempo = 1.0, smaller values
     /// represent slower tempo, larger faster tempo.
-	void setTempo(double newTempo);
+    void setTempo(double newTempo);
 
     /// Sets new rate control value as a difference in percents compared
     /// to the original rate (-50 .. +100 %)
-	void setRateChange(double newRate);
+    void setRateChange(double newRate);
 
     /// Sets new tempo control value as a difference in percents compared
     /// to the original tempo (-50 .. +100 %)
-	void setTempoChange(double newTempo);
+    void setTempoChange(double newTempo);
 
     /// Sets new pitch control value. Original pitch = 1.0, smaller values
     /// represent lower pitches, larger values higher pitch.
-	void setPitch(double newPitch);
+    void setPitch(double newPitch);
 
     /// Sets pitch change in octaves compared to the original pitch  
     /// (-1.00 .. +1.00)
-	void setPitchOctaves(double newPitch);
+    void setPitchOctaves(double newPitch);
 
     /// Sets pitch change in semi-tones compared to the original pitch
     /// (-12 .. +12)
     void setPitchSemiTones(int newPitch);
-	void setPitchSemiTones(double newPitch);
+    void setPitchSemiTones(double newPitch);
 
     /// Sets the number of channels, 1 = mono, 2 = stereo
     void setChannels(uint numChannels);
@@ -247,22 +247,22 @@ public:
                                                     ///< contains data for both channels.
             );
 
-	/// Output samples from beginning of the sample buffer. Copies requested samples to 
-	/// output buffer and removes them from the sample buffer. If there are less than 
-	/// 'numsample' samples in the buffer, returns all that available.
-	///
-	/// \return Number of samples returned.
-	virtual uint receiveSamples(SAMPLETYPE *output, ///< Buffer where to copy output samples.
-		uint maxSamples                 ///< How many samples to receive at max.
-		);
+    /// Output samples from beginning of the sample buffer. Copies requested samples to 
+    /// output buffer and removes them from the sample buffer. If there are less than 
+    /// 'numsample' samples in the buffer, returns all that available.
+    ///
+    /// \return Number of samples returned.
+    virtual uint receiveSamples(SAMPLETYPE *output, ///< Buffer where to copy output samples.
+        uint maxSamples                 ///< How many samples to receive at max.
+        );
 
-	/// Adjusts book-keeping so that given number of samples are removed from beginning of the 
-	/// sample buffer without copying them anywhere. 
-	///
-	/// Used to reduce the number of samples in the buffer when accessing the sample buffer directly
-	/// with 'ptrBegin' function.
-	virtual uint receiveSamples(uint maxSamples   ///< Remove this many samples from the beginning of pipe.
-		);
+    /// Adjusts book-keeping so that given number of samples are removed from beginning of the 
+    /// sample buffer without copying them anywhere. 
+    ///
+    /// Used to reduce the number of samples in the buffer when accessing the sample buffer directly
+    /// with 'ptrBegin' function.
+    virtual uint receiveSamples(uint maxSamples   ///< Remove this many samples from the beginning of pipe.
+        );
 
     /// Clears all the samples in the object's output and internal processing
     /// buffers.
diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp
index 9ad27aa..d42ee6d 100644
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@@ -63,7 +63,7 @@ using namespace soundtouch;
  *****************************************************************************/
 
 // Table for the hierarchical mixing position seeking algorithm
-static const short _scanOffsets[5][24]={
+const short _scanOffsets[5][24]={
     { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806,
       868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0},
     {-100,  -75,  -50,  -25,   25,   50,   75,  100,    0,    0,    0,   0,
@@ -94,7 +94,9 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
     bAutoSeqSetting = true;
     bAutoSeekSetting = true;
 
-//    outDebt = 0;
+    maxnorm = 0;
+    maxnormf = 1e8;
+
     skipFract = 0;
 
     tempo = 1.0f;
@@ -250,7 +252,7 @@ int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
     if (bQuickSeek) 
     {
         return seekBestOverlapPositionQuick(refPos);
-    } 
+    }
     else 
     {
         return seekBestOverlapPositionFull(refPos);
@@ -282,7 +284,6 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 }
 
 
-
 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the
 // routine
 //
@@ -336,6 +337,11 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
             }
         }
     }
+
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+    adaptNormalizer();
+#endif
+
     // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     clearCrossCorrState();
 
@@ -343,64 +349,161 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
 }
 
 
-// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
-// routine
+// Quick seek algorithm for improved runtime-performance: first scans the seek range coarsely
+// (every SCANSTEP positions), then rescans +-SCANWIND positions around the two best coarse
+// candidates at single-position precision. 'refPos' = input to match; returns the best offset.
 //
-// The best position is determined as the position where the two overlapped
-// sample sequences are 'most alike', in terms of the highest cross-correlation
-// value over the overlapping period
-int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) 
+// Based on testing:
+// - This algorithm gives on average 99% as good match as the full algorithm
+// - this quick seek algorithm finds the best match on ~90% of cases
+// - on those 10% of cases when this algorithm doesn't find best match,
+//   it still finds on average ~90% match vs. the best possible match
+int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
 {
-    int j;
+#define _MIN(a, b)   (((a) < (b)) ? (a) : (b))
+#define SCANSTEP    16
+#define SCANWIND    8
+
     int bestOffs;
-    double bestCorr, corr;
-    int scanCount, corrOffset, tempOffset;
+    int i;
+    int bestOffs2;
+    float bestCorr, corr;
+    float bestCorr2;
+    double norm;
+
+    // note: 'float' types used in this function in case that the platform would need to use software-fp
 
-    bestCorr = FLT_MIN;
-    bestOffs = _scanOffsets[0][0];
-    corrOffset = 0;
-    tempOffset = 0;
+    bestCorr = -FLT_MAX;    // lowest finite float; FLT_MIN is the smallest *positive* value
+    bestOffs = SCANWIND;
+    bestCorr2 = -FLT_MAX;
+    bestOffs2 = SCANWIND;   // >= SCANWIND so that the fine scan below never starts at a negative offset
 
-    // Scans for the best correlation value using four-pass hierarchical search.
+    int best = 0;           // which pass produced the final match (0 = coarse, 1 / 2 = fine); write-only here
+
+    // Scans for the best correlation value by testing each possible position
+    // over the permitted range. Look for two best matches on the first pass to
+    // increase possibility of ideal match.
     //
-    // The look-up table 'scans' has hierarchical position adjusting steps.
-    // In first pass the routine searhes for the highest correlation with 
-    // relatively coarse steps, then rescans the neighbourhood of the highest
-    // correlation with better resolution and so on.
-    for (scanCount = 0;scanCount < 4; scanCount ++) 
+    // Begin from "SCANSTEP" instead of SCANWIND to make the calculation
+    // catch the 'middlepoint' of seekLength vector as that's the a-priori
+    // expected best match position
+    //
+    // Roughly:
+    // - 15% of cases find best result directly on the first round,
+    // - 75% cases find better match on 2nd round around the best match from 1st round
+    // - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round
+    for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP)
     {
-        j = 0;
-        while (_scanOffsets[scanCount][j]) 
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the seek range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
         {
-            double norm;
-            tempOffset = corrOffset + _scanOffsets[scanCount][j];
-            if (tempOffset >= seekLength) break;
-
-            // Calculates correlation value for the mixing position corresponding
-            // to 'tempOffset'
-            corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer, norm);
-            // heuristic rule to slightly favour values close to mid of the range
-            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
-            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
-
-            // Checks for the highest correlation value
-            if (corr > bestCorr) 
-            {
-                bestCorr = corr;
-                bestOffs = tempOffset;
-            }
-            j ++;
+            // found new best match. keep the previous best as 2nd best match
+            bestCorr2 = bestCorr;
+            bestOffs2 = bestOffs;
+            bestCorr = corr;
+            bestOffs = i;
+        }
+        else if (corr > bestCorr2)
+        {
+            // not new best, but still new 2nd best match
+            bestCorr2 = corr;
+            bestOffs2 = i;
         }
-        corrOffset = bestOffs;
     }
+
+    // Scans surroundings of the found best match with small stepping
+    int end = _MIN(bestOffs + SCANWIND + 1, seekLength);
+    for (i = bestOffs - SCANWIND; i < end; i++)
+    {
+        if (i == bestOffs) continue;    // this offset already calculated, thus skip
+
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
+        {
+            bestCorr = corr;
+            bestOffs = i;
+            best = 1;
+        }
+    }
+
+    // Scans surroundings of the 2nd best match with small stepping
+    end = _MIN(bestOffs2 + SCANWIND + 1, seekLength);
+    for (i = bestOffs2 - SCANWIND; i < end; i++)
+    {
+        if (i == bestOffs2) continue;    // this offset already calculated, thus skip
+
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
+        {
+            bestCorr = corr;
+            bestOffs = i;
+            best = 2;
+        }
+    }
+
     // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     clearCrossCorrState();
 
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+    adaptNormalizer();
+#endif
+
     return bestOffs;
 }
 
 
 
+
+/// For integer algorithm: adapts 'overlapDividerBitsNorm' (the right-shift applied in calcCrossCorr)
+/// to the peak norm 'maxnorm' seen since the previous call, so that the divider is not pessimistically
+/// restrictive on quieter sections yet won't cause integer overflows either. Resets 'maxnorm' to 0.
+void TDStretch::adaptNormalizer()
+{
+    // Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to
+    // too low values during pauses in music
+    if ((maxnorm > 1000) || (maxnormf > 40000000))
+    { 
+        // norm averaging filter: 90% of the previous average + 10% of the latest peak norm
+        maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm;
+
+        if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16))
+        {
+            // large values, so increase divider
+            overlapDividerBitsNorm++;
+            if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase
+        }
+        else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0))
+        {
+            // extra small values (judged by the averaged norm), decrease divider
+            overlapDividerBitsNorm--;
+        }
+    }
+
+    maxnorm = 0;
+}
+
+
 /// clear cross correlation routine state if necessary 
 void TDStretch::clearCrossCorrState()
 {
@@ -422,7 +525,7 @@ void TDStretch::calcSeqParameters()
     #define AUTOSEQ_K           ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEQ_C           (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
 
-    // seek-window-ms setting values at above low & top tempo
+    // seek-window-ms setting values at above low & top tempo
     #define AUTOSEEK_AT_MIN     25.0
     #define AUTOSEEK_AT_MAX     15.0
     #define AUTOSEEK_K          ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
@@ -736,13 +839,15 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
     // calculate overlap length so that it's power of 2 - thus it's easy to do
     // integer division by right-shifting. Term "-1" at end is to account for 
     // the extra most significatnt bit left unused in result by signed multiplication 
-    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
-    if (overlapDividerBits > 9) overlapDividerBits = 9;
-    if (overlapDividerBits < 3) overlapDividerBits = 3;
-    newOvl = (int)pow(2.0, (int)overlapDividerBits + 1);    // +1 => account for -1 above
+    overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
+    if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9;
+    if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3;
+    newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1);    // +1 => account for -1 above
 
     acceptNewOverlapLength(newOvl);
 
+    overlapDividerBitsNorm = overlapDividerBitsPure;
+
     // calculate sloping divider so that crosscorrelation operation won't 
     // overflow 32-bit register. Max. sum of the crosscorrelation sum without 
     // divider would be 2^30*(N^3-N)/3, where N = overlap length
@@ -750,10 +855,10 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
 }
 
 
-double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm)
 {
     long corr;
-    long lnorm;
+    unsigned long lnorm;
     int i;
 
     corr = lnorm = 0;
@@ -763,15 +868,19 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
     for (i = 0; i < channels * overlapLength; i += 4) 
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;  // notice: do intermediate division here to avoid integer overflow
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;  // notice: do intermediate division here to avoid integer overflow
         corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+                mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
         lnorm += (mixingPos[i] * mixingPos[i] + 
-                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
+                mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
         lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + 
-                  mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
+                mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
     }
 
+    if (lnorm > maxnorm)
+    {
+        maxnorm = lnorm;
+    }
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     norm = (double)lnorm;
@@ -780,17 +889,17 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
 {
     long corr;
-    long lnorm;
+    unsigned long lnorm;
     int i;
 
     // cancel first normalizer tap from previous round
     lnorm = 0;
     for (i = 1; i <= channels; i ++)
     {
-        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBits;
+        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm;
     }
 
     corr = 0;
@@ -800,18 +909,23 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
     for (i = 0; i < channels * overlapLength; i += 4) 
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;  // notice: do intermediate division here to avoid integer overflow
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;  // notice: do intermediate division here to avoid integer overflow
         corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
     }
 
     // update normalizer with last samples of this round
     for (int j = 0; j < channels; j ++)
     {
         i --;
-        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
+        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm;
     }
+
     norm += (double)lnorm;
+    if (norm > maxnorm)
+    {
+        maxnorm = (unsigned long)norm;
+    }
 
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
@@ -896,7 +1010,7 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 
 
 /// Calculate cross-correlation
-double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
 {
     double corr;
     double norm;
@@ -927,7 +1041,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
 {
     double corr;
     int i;
diff --git a/source/SoundTouch/TDStretch.h b/source/SoundTouch/TDStretch.h
index 6400f05..046481b 100644
--- a/source/SoundTouch/TDStretch.h
+++ b/source/SoundTouch/TDStretch.h
@@ -112,39 +112,46 @@ class TDStretch : public FIFOProcessor
 protected:
     int channels;
     int sampleReq;
-    double tempo;
 
-    SAMPLETYPE *pMidBuffer;
-    SAMPLETYPE *pMidBufferUnaligned;
     int overlapLength;
     int seekLength;
     int seekWindowLength;
-    int overlapDividerBits;
+    int overlapDividerBitsNorm;
+    int overlapDividerBitsPure;
     int slopingDivider;
-    double nominalSkip;
-    double skipFract;
-    FIFOSampleBuffer outputBuffer;
-    FIFOSampleBuffer inputBuffer;
-    bool bQuickSeek;
-
     int sampleRate;
     int sequenceMs;
     int seekWindowMs;
     int overlapMs;
+
+    unsigned long maxnorm;
+    float maxnormf;
+
+    double tempo;
+    double nominalSkip;
+    double skipFract;
+
+    bool bQuickSeek;
     bool bAutoSeqSetting;
     bool bAutoSeekSetting;
 
+    SAMPLETYPE *pMidBuffer;
+    SAMPLETYPE *pMidBufferUnaligned;
+
+    FIFOSampleBuffer outputBuffer;
+    FIFOSampleBuffer inputBuffer;
+
     void acceptNewOverlapLength(int newOverlapLength);
 
     virtual void clearCrossCorrState();
     void calculateOverlapLength(int overlapMs);
 
-    virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm) const;
-    virtual double calcCrossCorrAccumulate(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm) const;
+    virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm);
+    virtual double calcCrossCorrAccumulate(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm);
 
     virtual int seekBestOverlapPositionFull(const SAMPLETYPE *refPos);
     virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos);
-    int seekBestOverlapPosition(const SAMPLETYPE *refPos);
+    virtual int seekBestOverlapPosition(const SAMPLETYPE *refPos);
 
     virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const;
     virtual void overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const;
@@ -154,6 +161,8 @@ protected:
     void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const;
 
     void calcSeqParameters();
+    void adaptNormalizer();
+
 
     /// Changes the tempo of the given sound samples.
     /// Returns amount of samples returned in the "output" buffer.
@@ -249,8 +258,8 @@ public:
     class TDStretchMMX : public TDStretch
     {
     protected:
-        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const;
-        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const;
+        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
+        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
         virtual void overlapStereo(short *output, const short *input) const;
         virtual void clearCrossCorrState();
     };
@@ -262,8 +271,8 @@ public:
     class TDStretchSSE : public TDStretch
     {
     protected:
-        double calcCrossCorr(const float *mixingPos, const float *compare, double &norm) const;
-        double calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const;
+        double calcCrossCorr(const float *mixingPos, const float *compare, double &norm);
+        double calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm);
     };
 
 #endif /// SOUNDTOUCH_ALLOW_SSE
diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp
index cb38d98..17fe108 100644
--- a/source/SoundTouch/mmx_optimized.cpp
+++ b/source/SoundTouch/mmx_optimized.cpp
@@ -68,7 +68,7 @@ using namespace soundtouch;
 
 
 // Calculates cross correlation of two buffers
-double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm) const
+double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
 {
     const __m64 *pVec1, *pVec2;
     __m64 shifter;
@@ -79,7 +79,7 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &d
     pVec1 = (__m64*)pV1;
     pVec2 = (__m64*)pV2;
 
-    shifter = _m_from_int(overlapDividerBits);
+    shifter = _m_from_int(overlapDividerBitsNorm);
     normaccu = accu = _mm_setzero_si64();
 
     // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
@@ -123,6 +123,11 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &d
     // Clear MMS state
     _m_empty();
 
+    if (norm > (long)maxnorm)
+    {
+        maxnorm = norm;
+    }
+
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     dnorm = (double)norm;
@@ -134,7 +139,7 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &d
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm) const
+double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
 {
     const __m64 *pVec1, *pVec2;
     __m64 shifter;
@@ -146,13 +151,13 @@ double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2,
     lnorm = 0;
     for (i = 1; i <= channels; i ++)
     {
-        lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBits;
+        lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBitsNorm;
     }
 
     pVec1 = (__m64*)pV1;
     pVec2 = (__m64*)pV2;
 
-    shifter = _m_from_int(overlapDividerBits);
+    shifter = _m_from_int(overlapDividerBitsNorm);
     accu = _mm_setzero_si64();
 
     // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
@@ -191,10 +196,15 @@ double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2,
     pV1 = (short *)pVec1;
     for (int j = 1; j <= channels; j ++)
     {
-        lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBits;
+        lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBitsNorm;
     }
     dnorm += (double)lnorm;
 
+    if (lnorm > (long)maxnorm)
+    {
+        maxnorm = lnorm;
+    }
+
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm);
@@ -233,7 +243,7 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
 
     // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
     // overlapDividerBits calculation earlier.
-    shifter = _m_from_int(overlapDividerBits + 1);
+    shifter = _m_from_int(overlapDividerBitsPure + 1);
 
     for (i = 0; i < overlapLength / 4; i ++)
     {
diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp
index d1c728e..1e28bc9 100644
--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@@ -71,7 +71,7 @@ using namespace soundtouch;
 #include <math.h>
 
 // Calculates cross correlation of two buffers
-double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) const
+double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm)
 {
     int i;
     const float *pVec1;
@@ -183,7 +183,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
 
 
 
-double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const
+double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm)
 {
     // call usual calcCrossCorr function because SSE does not show big benefit of 
     // accumulating "norm" value, and also the "norm" rolling algorithm would get