Ignore unknown tokens in urlTokenizer.getTokens()

Because all tokens extracted from a single URL may be iterated multiple times within one URL-matching cycle, it pays to ignore extracted tokens that are known not to be used anywhere in the static filtering engine. The gain when processing a single network request in the static filtering engine can be especially high for long, random-looking URLs, since such URLs are likely to consist mostly of tokens that are known not to be in use.
2024-09-15 07:22:28 +02:00 · 2019-04-26 17:14:00 -04:00 · 2019-04-26 17:14:00 -04:00 · 69a43e07c4
commit 69a43e07c4
parent 19ece97b0c
3 changed files with 47 additions and 14 deletions
--- a/src/js/background.js
+++ b/src/js/background.js
@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
        // Read-only
        systemSettings: {
            compiledMagic: 12,  // Increase when compiled format changes
-            selfieMagic: 11     // Increase when selfie format changes
+            selfieMagic: 12     // Increase when selfie format changes
        },

        restoreBackupSettings: {
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
    }
 };

-FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
+FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;

 registerFilterClass(FilterWildcard2HnAnchored);

@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
 const reRegexToken = /[%0-9A-Za-z]{2,}/g;
 const reRegexTokenAbort = /[([]/;
 const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
-const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
+const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;

 const badTokens = new Set([
    'com',
@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
    this.categories = new Map();
    this.dataFilters = new Map();
    this.filterParser.reset();
+    this.urlTokenizer.resetKnownTokens();

    // This will invalidate all tries
    FilterHostnameDict.reset();
@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
    const filterDataHolderId = FilterDataHolder.fid;
    const redirectTypeValue = typeNameToTypeValue.redirect;
    const unserialize = µb.CompiledLineIO.unserialize;
+    const knownTokens = this.urlTokenizer.knownTokens;

    for ( const line of this.goodFilters ) {
        if ( this.badFilters.has(line) ) {
@ -2348,6 +2350,7 @@ FilterContainer.prototype.freeze = function() {
                entry.next = bucket;
            }
            this.dataFilters.set(tokenHash, entry);
+            knownTokens[tokenHash & 0xFFFF] = 1;
            continue;
        }

@ -2394,6 +2397,8 @@ FilterContainer.prototype.freeze = function() {
            continue;
        }

+        knownTokens[tokenHash & 0xFFFF] = 1;
+
        if ( entry === undefined ) {
            bucket.set(tokenHash, filterFromCompiledData(fdata));
            continue;
@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
                discardedCount: this.discardedCount,
                categories: categoriesToSelfie(this.categories),
                dataFilters: dataFiltersToSelfie(this.dataFilters),
+                urlTokenizer: this.urlTokenizer.toSelfie(),
            })
        )
    ]);
@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
            this.allowFilterCount = selfie.allowFilterCount;
            this.blockFilterCount = selfie.blockFilterCount;
            this.discardedCount = selfie.discardedCount;
+            this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
            for ( const [ catbits, bucket ] of selfie.categories ) {
                const tokenMap = new Map();
                for ( const [ token, fdata ] of bucket ) {
@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
        toAdd = new Map(),
        toRemove = new Map();

-    let tokenHashes = this.urlTokenizer.getTokens(),
-        i = 0;
+    const tokenHashes = this.urlTokenizer.getTokens();
+    let i = 0;
    while ( i < 32 ) {
        let tokenHash = tokenHashes[i++];
        if ( tokenHash === 0 ) { break; }
--- a/src/js/utils.js
+++ b/src/js/utils.js
@ -65,6 +65,9 @@
        this._urlOut = '';
        this._tokenized = false;
        this._tokens = [ 0 ];
+
+        this.knownTokens = new Uint8Array(65536);
+        this.resetKnownTokens();
    }

    setURL(url) {
@ -76,6 +79,15 @@
        return this._urlOut;
    }

+    resetKnownTokens() {
+        this.knownTokens.fill(0);
+        this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
+    }
+
    // Tokenize on demand.
    getTokens() {
        if ( this._tokenized ) { return this._tokens; }
@ -92,12 +104,6 @@
        return this._tokens;
    }

-    _appendTokenAt(i, th, ti) {
-        this._tokens[i+0] = th;
-        this._tokens[i+1] = ti;
-        return i + 2;
-    }
-
    tokenHashFromString(s) {
        const l = s.length;
        if ( l === 0 ) { return 0; }
@ -119,9 +125,26 @@
        return s;
    }

+    toSelfie() {
+        return µBlock.base64.encode(
+            this.knownTokens.buffer,
+            this.knownTokens.byteLength
+        );
+    }
+
+    fromSelfie(selfie) {
+        return µBlock.base64.decode(selfie, this.knownTokens.buffer);
+    }
+
    // https://github.com/chrisaljoudi/uBlock/issues/1118
    // We limit to a maximum number of tokens.

+    _appendTokenAt(i, th, ti) {
+        this._tokens[i+0] = th;
+        this._tokens[i+1] = ti;
+        return i + 2;
+    }
+
    _tokenize() {
        const tokens = this._tokens;
        let url = this._urlOut;
@ -131,6 +154,7 @@
            url = url.slice(0, 2048);
            l = 2048;
        }
+        const knownTokens = this.knownTokens;
        const vtc = this._validTokenChars;
        let i = 0, j = 0, v, n, ti, th;
        for (;;) {
@ -148,9 +172,11 @@
                th = th * 64 + v;
                n += 1;
            }
-            tokens[j+0] = th;
-            tokens[j+1] = ti;
-            j += 2;
+            if ( knownTokens[th & 0xFFFF] !== 0 ) {
+                tokens[j+0] = th;
+                tokens[j+1] = ti;
+                j += 2;
+            }
        }
    }
 })();