From 63acdcbdebad04d9af4c56ad7387d97575e7f5a3 Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Tue, 5 Mar 2024 11:11:42 -0500 Subject: [PATCH] Assume UTF-8 when no encoding can be looked up. This will make HTML filtering and the `replace=` filter option less likely to be bypassed by uBO, as the body response filterer previously required an encoding to be expressly declared before acting on the response body. UTF-8 usage is currently reported as ~98.2%: https://w3techs.com/technologies/history_overview/character_encoding --- src/js/traffic.js | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/js/traffic.js b/src/js/traffic.js index 02043ac1e..cb56063f4 100644 --- a/src/js/traffic.js +++ b/src/js/traffic.js @@ -749,7 +749,7 @@ const bodyFilterer = (( ) => { /* t */ if ( bytes[i+6] !== 0x74 ) { continue; } break; } - if ( (i - 40) >= 65536 ) { return; } + if ( (i + 40) >= 65536 ) { return; } i += 8; // find first alpha character let j = -1; @@ -827,13 +827,17 @@ const bodyFilterer = (( ) => { } if ( this.status !== 'finishedtransferringdata' ) { return; } - // If encoding is still unknown, try to extract from stream data + // If encoding is still unknown, try to extract from stream data. + // Just assume utf-8 if ultimately no encoding can be looked up. if ( session.charset === undefined ) { const charsetFound = charsetFromStream(session.buffer); - if ( charsetFound === undefined ) { return streamClose(session); } - const charsetUsed = textEncode.normalizeCharset(charsetFound); - if ( charsetUsed === undefined ) { return streamClose(session); } - session.charset = charsetUsed; + if ( charsetFound !== undefined ) { + const charsetUsed = textEncode.normalizeCharset(charsetFound); + if ( charsetUsed === undefined ) { return streamClose(session); } + session.charset = charsetUsed; + } else { + session.charset = 'utf-8'; + } } while ( session.jobs.length !== 0 ) {