mirror of https://github.com/gorhill/uBlock.git synced 2024-11-07 03:12:33 +01:00

Ignore unknown tokens in urlTokenizer.getTokens()

Given that all tokens extracted from a single URL are potentially
iterated multiple times in a single URL-matching cycle, it pays to
ignore extracted tokens which are known not to be used anywhere in
the static filtering engine.

The gain when processing a single network request in the static
filtering engine can become especially high for long, random-looking
URLs, which are likely to consist mostly of tokens that are not in
use anywhere in the static filtering engine.
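
The idea, sketched below with simplified names: the filtering engine keeps a 64 KiB bitmap indexed by the low 16 bits of each token hash, flags the slots of tokens actually indexed by compiled filters, and the tokenizer then drops any extracted token whose slot was never flagged. The toy tokenHash() helper and the sample token list are assumptions for illustration only; they stand in for urlTokenizer.tokenHashFromString() and the compiled filter set, not the actual uBlock code.

// Minimal sketch of the known-token filter (simplified, not the actual uBlock code).
const knownTokens = new Uint8Array(65536);      // one flag per low-16-bit hash slot

// Toy stand-in for urlTokenizer.tokenHashFromString() (assumed for illustration).
const tokenHash = s => {
    let th = 0;
    for ( let i = 0; i < s.length; i++ ) {
        th = th * 64 + (s.charCodeAt(i) & 0x3F);
    }
    return th;
};

// At freeze() time: flag the hashes of tokens used by compiled filters
// (hypothetical filter tokens).
for ( const token of [ 'ads', 'banner', 'track' ] ) {
    knownTokens[tokenHash(token) & 0xFFFF] = 1;
}

// At tokenize time: keep only extracted tokens whose hash slot is flagged.
const extractTokens = url =>
    (url.toLowerCase().match(/[0-9a-z%]{2,}/g) || [])
        .filter(t => knownTokens[tokenHash(t) & 0xFFFF] !== 0);

console.log(extractTokens('https://example.com/banner/xk7qz93f/ads.js'));
// -> [ 'banner', 'ads' ] -- the random-looking path segment never reaches
//    the URL-matching cycle.

Indexing by tokenHash & 0xFFFF keeps the membership test to a single array read at a fixed cost of 64 KiB; a hash collision can only cause an unused token to be needlessly kept, never cause a filter to be missed.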
Raymond Hill 2019-04-26 17:14:00 -04:00
parent 19ece97b0c
commit 69a43e07c4
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
3 changed files with 47 additions and 14 deletions

src/js/background.js

@@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
     // Read-only
     systemSettings: {
         compiledMagic: 12,  // Increase when compiled format changes
-        selfieMagic: 11     // Increase when selfie format changes
+        selfieMagic: 12     // Increase when selfie format changes
     },
     restoreBackupSettings: {

src/js/static-net-filtering.js

@@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
     }
 };
-FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
+FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
 registerFilterClass(FilterWildcard2HnAnchored);
@@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
 const reRegexToken = /[%0-9A-Za-z]{2,}/g;
 const reRegexTokenAbort = /[([]/;
 const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
-const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
+const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
 const badTokens = new Set([
     'com',
@@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
     this.categories = new Map();
     this.dataFilters = new Map();
     this.filterParser.reset();
+    this.urlTokenizer.resetKnownTokens();
     // This will invalidate all tries
     FilterHostnameDict.reset();
@@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
     const filterDataHolderId = FilterDataHolder.fid;
     const redirectTypeValue = typeNameToTypeValue.redirect;
     const unserialize = µb.CompiledLineIO.unserialize;
+    const knownTokens = this.urlTokenizer.knownTokens;
     for ( const line of this.goodFilters ) {
         if ( this.badFilters.has(line) ) {
@@ -2348,6 +2350,7 @@ FilterContainer.prototype.freeze = function() {
                 entry.next = bucket;
             }
             this.dataFilters.set(tokenHash, entry);
+            knownTokens[tokenHash & 0xFFFF] = 1;
             continue;
         }
@ -2394,6 +2397,8 @@ FilterContainer.prototype.freeze = function() {
continue; continue;
} }
knownTokens[tokenHash & 0xFFFF] = 1;
if ( entry === undefined ) { if ( entry === undefined ) {
bucket.set(tokenHash, filterFromCompiledData(fdata)); bucket.set(tokenHash, filterFromCompiledData(fdata));
continue; continue;
@@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
                 discardedCount: this.discardedCount,
                 categories: categoriesToSelfie(this.categories),
                 dataFilters: dataFiltersToSelfie(this.dataFilters),
+                urlTokenizer: this.urlTokenizer.toSelfie(),
             })
         )
     ]);
@@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
             this.allowFilterCount = selfie.allowFilterCount;
             this.blockFilterCount = selfie.blockFilterCount;
             this.discardedCount = selfie.discardedCount;
+            this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
             for ( const [ catbits, bucket ] of selfie.categories ) {
                 const tokenMap = new Map();
                 for ( const [ token, fdata ] of bucket ) {
@@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
         toAdd = new Map(),
         toRemove = new Map();
-    let tokenHashes = this.urlTokenizer.getTokens(),
-        i = 0;
+    const tokenHashes = this.urlTokenizer.getTokens();
+    let i = 0;
     while ( i < 32 ) {
         let tokenHash = tokenHashes[i++];
         if ( tokenHash === 0 ) { break; }

src/js/utils.js

@@ -65,6 +65,9 @@
         this._urlOut = '';
         this._tokenized = false;
         this._tokens = [ 0 ];
+        this.knownTokens = new Uint8Array(65536);
+        this.resetKnownTokens();
     }
     setURL(url) {
@@ -76,6 +79,15 @@
         return this._urlOut;
     }
+    resetKnownTokens() {
+        this.knownTokens.fill(0);
+        this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
+    }
     // Tokenize on demand.
     getTokens() {
         if ( this._tokenized ) { return this._tokens; }
@@ -92,12 +104,6 @@
         return this._tokens;
     }
-    _appendTokenAt(i, th, ti) {
-        this._tokens[i+0] = th;
-        this._tokens[i+1] = ti;
-        return i + 2;
-    }
     tokenHashFromString(s) {
         const l = s.length;
         if ( l === 0 ) { return 0; }
@@ -119,9 +125,26 @@
         return s;
     }
+    toSelfie() {
+        return µBlock.base64.encode(
+            this.knownTokens.buffer,
+            this.knownTokens.byteLength
+        );
+    }
+    fromSelfie(selfie) {
+        return µBlock.base64.decode(selfie, this.knownTokens.buffer);
+    }
     // https://github.com/chrisaljoudi/uBlock/issues/1118
     // We limit to a maximum number of tokens.
+    _appendTokenAt(i, th, ti) {
+        this._tokens[i+0] = th;
+        this._tokens[i+1] = ti;
+        return i + 2;
+    }
     _tokenize() {
         const tokens = this._tokens;
         let url = this._urlOut;
@@ -131,6 +154,7 @@
             url = url.slice(0, 2048);
             l = 2048;
         }
+        const knownTokens = this.knownTokens;
         const vtc = this._validTokenChars;
         let i = 0, j = 0, v, n, ti, th;
         for (;;) {
@@ -148,9 +172,11 @@
                 th = th * 64 + v;
                 n += 1;
             }
-            tokens[j+0] = th;
-            tokens[j+1] = ti;
-            j += 2;
+            if ( knownTokens[th & 0xFFFF] !== 0 ) {
+                tokens[j+0] = th;
+                tokens[j+1] = ti;
+                j += 2;
+            }
         }
     }
 })();
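
A note on the selfie changes above: the bitmap is serialized with uBlock's internal µBlock.base64 helper. The sketch below is only an assumption-level illustration of the round trip, using Node's Buffer in place of that helper to stay self-contained; it is not the project's own serialization code.

const knownTokens = new Uint8Array(65536);

// Flag a few hypothetical token-hash slots as "known".
for ( const th of [ 0x0041, 0x1234, 0xBEEF ] ) {
    knownTokens[th & 0xFFFF] = 1;
}

// toSelfie(): encode the raw bytes as a compact string
// (stand-in for µBlock.base64.encode).
const selfie = Buffer.from(knownTokens.buffer, 0, knownTokens.byteLength)
                     .toString('base64');

// fromSelfie(): decode back into a bitmap of the same size
// (stand-in for µBlock.base64.decode).
const restored = new Uint8Array(65536);
restored.set(Buffer.from(selfie, 'base64'));

console.log(restored[0x1234] === 1);  // true -- the flags survive the round trip

Persisting the bitmap in the selfie matters because knownTokens is only populated while filters are compiled in freeze(); loading from a selfie skips that step, so the flags have to travel with the rest of the filtering engine's state.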