From 63acdcbdebad04d9af4c56ad7387d97575e7f5a3 Mon Sep 17 00:00:00 2001
From: Raymond Hill <rhill@raymondhill.net>
Date: Tue, 5 Mar 2024 11:11:42 -0500
Subject: [PATCH] Assume UTF-8 when no encoding can be looked up.

This makes HTML filtering and the `replace=` filter option less
likely to be skipped by uBO: previously, the body response filterer
required an encoding to be expressly declared before it would act
on the response body.

UTF-8 usage is currently reported as ~98.2%:
https://w3techs.com/technologies/history_overview/character_encoding
---
 src/js/traffic.js | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/js/traffic.js b/src/js/traffic.js
index 02043ac1e..cb56063f4 100644
--- a/src/js/traffic.js
+++ b/src/js/traffic.js
@@ -749,7 +749,7 @@ const bodyFilterer = (( ) => {
             /* t */ if ( bytes[i+6] !== 0x74 ) { continue; }
             break;
         }
-        if ( (i - 40) >= 65536 ) { return; }
+        if ( (i + 40) >= 65536 ) { return; }
         i += 8;
         // find first alpha character
         let j = -1;
@@ -827,13 +827,17 @@ const bodyFilterer = (( ) => {
         }
         if ( this.status !== 'finishedtransferringdata' ) { return; }
 
-        // If encoding is still unknown, try to extract from stream data
+        // If encoding is still unknown, try to extract from stream data.
+        // Just assume utf-8 if ultimately no encoding can be looked up.
         if ( session.charset === undefined ) {
             const charsetFound = charsetFromStream(session.buffer);
-            if ( charsetFound === undefined ) { return streamClose(session); }
-            const charsetUsed = textEncode.normalizeCharset(charsetFound);
-            if ( charsetUsed === undefined ) { return streamClose(session); }
-            session.charset = charsetUsed;
+            if ( charsetFound !== undefined ) {
+                const charsetUsed = textEncode.normalizeCharset(charsetFound);
+                if ( charsetUsed === undefined ) { return streamClose(session); }
+                session.charset = charsetUsed;
+            } else {
+                session.charset = 'utf-8';
+            }
         }
 
         while ( session.jobs.length !== 0 ) {