diff --git a/src/js/background.js b/src/js/background.js index c08f91c81..6f537ff8a 100644 --- a/src/js/background.js +++ b/src/js/background.js @@ -141,8 +141,8 @@ const µBlock = (( ) => { // jshint ignore:line // Read-only systemSettings: { - compiledMagic: 21, // Increase when compiled format changes - selfieMagic: 22, // Increase when selfie format changes + compiledMagic: 23, // Increase when compiled format changes + selfieMagic: 23, // Increase when selfie format changes }, restoreBackupSettings: { diff --git a/src/js/document-blocked.js b/src/js/document-blocked.js index 4b9520901..858f2d80d 100644 --- a/src/js/document-blocked.js +++ b/src/js/document-blocked.js @@ -44,7 +44,6 @@ let details = {}; (async ( ) => { const response = await messaging.send('documentBlocked', { what: 'listsFromNetFilter', - compiledFilter: details.fc, rawFilter: details.fs, }); if ( response instanceof Object === false ) { return; } diff --git a/src/js/hntrie.js b/src/js/hntrie.js index d1c0db5fb..ad70becb7 100644 --- a/src/js/hntrie.js +++ b/src/js/hntrie.js @@ -560,6 +560,8 @@ HNTrieContainer.prototype.HNTrieRef = class { this.container = container; this.iroot = iroot; this.size = size; + this.needle = ''; + this.last = -1; } add(hn) { diff --git a/src/js/logger-ui.js b/src/js/logger-ui.js index 6736de2ed..11e8e9e68 100644 --- a/src/js/logger-ui.js +++ b/src/js/logger-ui.js @@ -659,7 +659,6 @@ const viewPort = (( ) => { } if ( filteringType === 'static' ) { divcl.add('canLookup'); - div.setAttribute('data-filter', filter.compiled); } else if ( filteringType === 'cosmetic' ) { divcl.add('canLookup'); divcl.toggle('isException', filter.raw.startsWith('#@#')); @@ -1465,7 +1464,6 @@ const reloadTab = function(ev) { const fillSummaryPaneFilterList = async function(rows) { const rawFilter = targetRow.children[1].textContent; - const compiledFilter = targetRow.getAttribute('data-filter'); const nodeFromFilter = function(filter, lists) { const fragment = 
document.createDocumentFragment(); @@ -1524,7 +1522,6 @@ const reloadTab = function(ev) { if ( targetRow.classList.contains('networkRealm') ) { const response = await messaging.send('loggerUI', { what: 'listsFromNetFilter', - compiledFilter: compiledFilter, rawFilter: rawFilter, }); handleResponse(response); diff --git a/src/js/messaging.js b/src/js/messaging.js index be2f64f82..9d6df2189 100644 --- a/src/js/messaging.js +++ b/src/js/messaging.js @@ -66,14 +66,18 @@ const onMessage = function(request, sender, callback) { case 'listsFromNetFilter': µb.staticFilteringReverseLookup.fromNetFilter( - request.compiledFilter, - request.rawFilter, - callback - ); + request.rawFilter + ).then(response => { + callback(response); + }); return; case 'listsFromCosmeticFilter': - µb.staticFilteringReverseLookup.fromCosmeticFilter(request, callback); + µb.staticFilteringReverseLookup.fromCosmeticFilter( + request + ).then(response => { + callback(response); + }); return; case 'reloadAllFilters': diff --git a/src/js/redirect-engine.js b/src/js/redirect-engine.js index c5720f346..4662ae2da 100644 --- a/src/js/redirect-engine.js +++ b/src/js/redirect-engine.js @@ -737,6 +737,9 @@ RedirectEngine.prototype.loadBuiltinResources = function() { store(name, reader.result); resolve(); }; + reader.onabort = reader.onerror = ( ) => { + resolve(); + }; reader.readAsDataURL(blob); }); }; diff --git a/src/js/reverselookup.js b/src/js/reverselookup.js index fe8cb048b..468b534ab 100644 --- a/src/js/reverselookup.js +++ b/src/js/reverselookup.js @@ -39,9 +39,9 @@ let messageId = 1; const onWorkerMessage = function(e) { const msg = e.data; - const callback = pendingResponses.get(msg.id); + const resolver = pendingResponses.get(msg.id); pendingResponses.delete(msg.id); - callback(msg.response); + resolver(msg.response); }; /******************************************************************************/ @@ -55,6 +55,9 @@ const stopWorker = function() { worker.terminate(); worker = null; needLists = 
true; + for ( const resolver of pendingResponses.values() ) { + resolver(); + } pendingResponses.clear(); }; @@ -127,36 +130,34 @@ const initWorker = function() { /******************************************************************************/ -const fromNetFilter = async function(compiledFilter, rawFilter, callback) { - if ( typeof callback !== 'function' ) { - return; - } +const fromNetFilter = async function(rawFilter) { + if ( typeof rawFilter !== 'string' || rawFilter === '' ) { return; } - if ( compiledFilter === '' || rawFilter === '' ) { - callback(); + const µb = µBlock; + const writer = new µb.CompiledLineIO.Writer(); + if ( µb.staticNetFilteringEngine.compile(rawFilter, writer) === false ) { return; } await initWorker(); const id = messageId++; - const message = { + worker.postMessage({ what: 'fromNetFilter', id: id, - compiledFilter: compiledFilter, + compiledFilter: writer.last(), rawFilter: rawFilter - }; - pendingResponses.set(id, callback); - worker.postMessage(message); + }); + + return new Promise(resolve => { + pendingResponses.set(id, resolve); + }); }; /******************************************************************************/ -const fromCosmeticFilter = async function(details, callback) { - if ( typeof callback !== 'function' ) { return; } - - if ( details.rawFilter === '' ) { - callback(); +const fromCosmeticFilter = async function(details) { + if ( typeof details.rawFilter !== 'string' || details.rawFilter === '' ) { return; } @@ -164,7 +165,7 @@ const fromCosmeticFilter = async function(details, callback) { const id = messageId++; const hostname = µBlock.URI.hostnameFromURI(details.url); - pendingResponses.set(id, callback); + worker.postMessage({ what: 'fromCosmeticFilter', id: id, @@ -182,6 +183,11 @@ const fromCosmeticFilter = async function(details, callback) { ) === 2, rawFilter: details.rawFilter }); + + return new Promise(resolve => { + pendingResponses.set(id, resolve); + }); + }; 
/******************************************************************************/ diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 855635b17..52170d825 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -31,6 +31,7 @@ /******************************************************************************/ const µb = µBlock; +const urlTokenizer = µb.urlTokenizer; // fedcba9876543210 // | | ||| @@ -160,11 +161,6 @@ const toNormalizedType = { const BlockImportant = BlockAction | Important; -const reIsWildcarded = /[\^\*]/; - -// ABP filters: https://adblockplus.org/en/filters -// regex tester: http://regex101.com/ - /******************************************************************************/ // See the following as short-lived registers, used during evaluation. They are @@ -173,64 +169,37 @@ const reIsWildcarded = /[\^\*]/; let $requestURL = ''; let $requestHostname = ''; let $docHostname = ''; +let $tokenBeg = 0; +let $patternMatchLeft = 0; +let $patternMatchRight = 0; -/******************************************************************************/ - -// First character of match must be within the hostname part of the url. -// -// https://github.com/gorhill/uBlock/issues/1929 -// Match only hostname label boundaries. - -const isHnAnchored = (( ) => { - let lastLen = 0, lastBeg = -1, lastEnd = -1; - - return (url, matchStart) => { - const len = $requestHostname.length; - if ( len !== lastLen || url.endsWith('://', lastBeg) === false ) { - lastBeg = len !== 0 ? url.indexOf('://') : -1; - if ( lastBeg !== -1 ) { - lastBeg += 3; - lastEnd = lastBeg + len; - } else { - lastEnd = -1; - } - lastLen = len; - } - return matchStart < lastEnd && ( - matchStart === lastBeg || - matchStart > lastBeg && - url.charCodeAt(matchStart - 1) === 0x2E /* '.' 
*/ - ); - }; -})(); +// EXPERIMENT: $requestTypeBit +let $requestTypeBit = 0; /******************************************************************************/ // Local helpers -const normalizeRegexSource = function(s) { - try { - const re = new RegExp(s); - return re.source; - } catch (ex) { - normalizeRegexSource.message = ex.toString(); - } - return ''; -}; +const restrSeparator = '(?:[^%.0-9a-z_-]|$)'; -const rawToRegexStr = function(s, anchor) { - // https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/ - // https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions - // Also: remove leading/trailing wildcards -- there is no point. - let reStr = s.replace(rawToRegexStr.escape1, '\\$&') - .replace(rawToRegexStr.escape2, '(?:[^%.0-9a-z_-]|$)') - .replace(rawToRegexStr.escape3, '') - .replace(rawToRegexStr.escape4, '[^ ]*?'); +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions + +// Convert a plain string (devoid of special characters) into a regex. +const restrFromPlainPattern = function(s) { + return s.replace(restrFromPlainPattern.escape, '\\$&'); +}; +restrFromPlainPattern.escape = /[.*+?^${}()|[\]\\]/g; + +const restrFromGenericPattern = function(s, anchor = 0) { + let reStr = s.replace(restrFromGenericPattern.rePlainChars, '\\$&') + .replace(restrFromGenericPattern.reSeparators, restrSeparator) + .replace(restrFromGenericPattern.reDanglingAsterisks, '') + .replace(restrFromGenericPattern.reAsterisks, '\\S*?'); if ( anchor & 0b100 ) { reStr = ( reStr.startsWith('\\.') ? 
- rawToRegexStr.reTextHostnameAnchor2 : - rawToRegexStr.reTextHostnameAnchor1 + restrFromGenericPattern.restrHostnameAnchor2 : + restrFromGenericPattern.restrHostnameAnchor1 ) + reStr; } else if ( anchor & 0b010 ) { reStr = '^' + reStr; @@ -240,65 +209,133 @@ const rawToRegexStr = function(s, anchor) { } return reStr; }; -rawToRegexStr.escape1 = /[.+?${}()|[\]\\]/g; -rawToRegexStr.escape2 = /\^/g; -rawToRegexStr.escape3 = /^\*|\*$/g; -rawToRegexStr.escape4 = /\*/g; -rawToRegexStr.reTextHostnameAnchor1 = '^[a-z-]+://(?:[^/?#]+\\.)?'; -rawToRegexStr.reTextHostnameAnchor2 = '^[a-z-]+://(?:[^/?#]+)?'; +restrFromGenericPattern.rePlainChars = /[.+?${}()|[\]\\]/g; +restrFromGenericPattern.reSeparators = /\^/g; +restrFromGenericPattern.reDanglingAsterisks = /^\*+|\*+$/g; +restrFromGenericPattern.reAsterisks = /\*+/g; +restrFromGenericPattern.restrHostnameAnchor1 = '^[a-z-]+://(?:[^/?#]+\\.)?'; +restrFromGenericPattern.restrHostnameAnchor2 = '^[a-z-]+://(?:[^/?#]+)?'; -// https://github.com/uBlockOrigin/uAssets/issues/4083#issuecomment-436914727 -const rawToPlainStr = function(s, anchor) { - if ( - anchor === 0 && - s.charCodeAt(0) === 0x2F /* '/' */ && - s.length > 2 && - s.charCodeAt(s.length-1) === 0x2F /* '/' */ - ) { - s = s + '*'; - } - return s; -}; - -const filterDataSerialize = µb.CompiledLineIO.serialize; - -const toLogDataInternal = function(categoryBits, tokenHash, filter) { - if ( filter === null ) { return undefined; } - const logData = filter.logData(); - logData.compiled = filterDataSerialize([ - categoryBits, - tokenHash, - logData.compiled - ]); +const toLogDataInternal = function(categoryBits, tokenHash, iunit) { + if ( iunit === 0 ) { return; } + const pattern = []; + const regex = []; + const options = []; + const domains = []; + const logData = { pattern, regex, domains, options, isRegex: false }; + filterUnits[iunit].logData(logData); if ( categoryBits & 0x001 ) { - logData.raw = `@@${logData.raw}`; + logData.pattern.unshift('@@'); } - const opts = 
[]; if ( categoryBits & 0x002 ) { - opts.push('important'); + logData.options.unshift('important'); } if ( categoryBits & 0x008 ) { - opts.push('3p'); + logData.options.unshift('3p'); } else if ( categoryBits & 0x004 ) { - opts.push('1p'); + logData.options.unshift('1p'); } const type = categoryBits & 0x1F0; if ( type !== 0 && type !== typeNameToTypeValue.data ) { - opts.push(typeValueToTypeName[type >>> 4]); + logData.options.unshift(typeValueToTypeName[type >>> 4]); } - if ( logData.opts !== undefined ) { - opts.push(logData.opts); + let raw = logData.pattern.join(''); + if ( + logData.isRegex === false && + raw.charCodeAt(0) === 0x2F /* '/' */ && + raw.charCodeAt(raw.length - 1) === 0x2F /* '/' */ + ) { + raw += '*'; } - if ( opts.length !== 0 ) { - logData.raw += '$' + opts.join(','); + if ( domains.length !== 0 ) { + options.push(`domain=${domains.join('|')}`); } - return logData; + if ( options.length !== 0 ) { + raw += '$' + options.join(','); + } + return { raw, regex: logData.regex.join('') }; +}; + +/******************************************************************************/ + +const charClassMap = new Uint32Array(128); +const CHAR_CLASS_SEPARATOR = 0b00000001; + +{ + const reSeparators = /[^\w%.-]/; + for ( let i = 0; i < 128; i++ ) { + if ( reSeparators.test(String.fromCharCode(i)) ) { + charClassMap[i] |= CHAR_CLASS_SEPARATOR; + } + } +} + +const isSeparatorChar = function(c) { + return (charClassMap[c] & CHAR_CLASS_SEPARATOR) !== 0; +}; + +/******************************************************************************/ + +let filterUnits = [ null ]; + +let filterSequences = new Uint32Array(131072); +let filterSequenceWritePtr = 3; + +const filterSequenceAdd = function(a, b) { + const i = filterSequenceWritePtr; + filterSequenceWritePtr += 2; + if ( filterSequenceWritePtr > filterSequences.length ) { + filterSequenceBufferResize(filterSequenceWritePtr); + } + filterSequences[i+0] = a; + filterSequences[i+1] = b; + return i; +}; + +const 
filterSequenceBufferResize = function(newSize) { + if ( newSize <= filterSequences.length ) { return; } + const size = (newSize + 0x3FFF) & ~0x3FFF; + const buffer = new Uint32Array(size); + buffer.set(filterSequences); + filterSequences = buffer; +}; + +/******************************************************************************/ + +const bidiTrieMatchExtra = function(l, r, ix) { + for (;;) { + $patternMatchLeft = l; + $patternMatchRight = r; + const iu = filterSequences[ix+0]; + if ( filterUnits[iu].match() ) { return iu; } + ix = filterSequences[ix+1]; + if ( ix === 0 ) { break; } + } + return 0; +}; + +const bidiTrie = (( ) => { + let trieDetails; + try { + trieDetails = JSON.parse( + vAPI.localStorage.getItem('SNFE.bidiTrieDetails') + ); + } catch(ex) { + } + return new µb.BidiTrieContainer(trieDetails, bidiTrieMatchExtra); +})(); + +const bidiTrieOptimize = function() { + const trieDetails = bidiTrie.optimize(); + vAPI.localStorage.setItem( + 'SNFE.bidiTrieDetails', + JSON.stringify(trieDetails) + ); }; /******************************************************************************* - Each filter class will register itself in the map. A filter class - id MUST always stringify to ONE single character. + Each filter class will register itself in the map. IMPORTANT: any change which modifies the mapping will have to be reflected with µBlock.systemSettings.compiledMagic. 
@@ -309,13 +346,114 @@ const filterClasses = []; let filterClassIdGenerator = 0; const registerFilterClass = function(ctor) { - let fid = filterClassIdGenerator++; + const fid = filterClassIdGenerator++; ctor.fid = ctor.prototype.fid = fid; filterClasses[fid] = ctor; }; -const filterFromCompiledData = function(args) { - return filterClasses[args[0]].load(args); +const filterFromCtor = function(ctor, ...args) { + if ( ctor.filterUnit !== undefined ) { + return ctor.filterUnit; + } + const f = new ctor(...args); + const iunit = filterUnits.length; + filterUnits.push(f); + return iunit; +}; + +const filterUnitFromCompiled = function(args) { + const ctor = filterClasses[args[0]]; + return ctor.unitFromCompiled(args); +}; + +const filterFromSelfie = function(args) { + return filterClasses[args[0]].fromSelfie(args); +}; + +/******************************************************************************/ + +const filterPattern = { + compile: function(parsed, units) { + if ( parsed.isRegex ) { + units.push(FilterRegex.compile(parsed)); + return; + } + const pattern = parsed.f; + if ( pattern === '*' ) { + units.push(FilterTrue.compile()); + return; + } + if ( parsed.tokenHash === parsed.noTokenHash ) { + units.push(FilterPatternGeneric.compile(parsed)); + return; + } + if ( parsed.firstWildcardPos === -1 && parsed.firstCaretPos === -1 ) { + units.push(FilterPatternPlain.compile(parsed)); + return; + } + if ( + parsed.secondWildcardPos !== -1 || + parsed.secondCaretPos !== -1 || + parsed.firstCaretPos !== -1 && ( + parsed.firstWildcardPos === -1 || + parsed.firstWildcardPos !== (parsed.firstCaretPos + 1) + ) + ) { + return this.compileGeneric(parsed, units); + } + const hasCaretCombo = parsed.firstCaretPos !== -1; + const sright = pattern.slice(parsed.firstWildcardPos + 1); + const sleft = pattern.slice( + 0, + hasCaretCombo ? 
parsed.firstCaretPos : parsed.firstWildcardPos + ); + if ( parsed.tokenBeg < parsed.firstWildcardPos ) { + parsed.f = sleft; + units.push(FilterPatternPlain.compile(parsed)); + parsed.f = sright; + units.push(FilterPatternRight.compile(parsed, hasCaretCombo)); + return; + } + // parsed.tokenBeg > parsed.firstWildcardPos + parsed.f = sright; + parsed.tokenBeg -= parsed.firstWildcardPos + 1; + units.push(FilterPatternPlain.compile(parsed)); + parsed.f = sleft; + units.push(FilterPatternLeft.compile(parsed, hasCaretCombo)); + }, + compileGeneric: function(parsed, units) { + const pattern = parsed.f; + // Optimize special case: plain pattern with trailing caret + if ( + parsed.firstWildcardPos === -1 && + parsed.firstCaretPos === (pattern.length - 1) + ) { + parsed.f = pattern.slice(0, -1); + units.push(FilterPatternPlain.compile(parsed)); + units.push(FilterTrailingSeparator.compile()); + return; + } + // Use a plain pattern as a first test for whether the generic pattern + // needs to be matched. + // TODO: inconclusive, investigate more. 
+ //let left = parsed.tokenBeg; + //while ( left > 0 ) { + // const c = pattern.charCodeAt(left-1); + // if ( c === 0x2A /* '*' */ || c === 0x5E /* '^' */ ) { break; } + // left -= 1; + //} + //let right = parsed.tokenBeg + parsed.token.length; + //while ( right < pattern.length ) { + // const c = pattern.charCodeAt(right); + // if ( c === 0x2A /* '*' */ || c === 0x5E /* '^' */ ) { break; } + // right += 1; + //} + //parsed.f = pattern.slice(left, right); + //parsed.tokenBeg -= left; + //units.push(FilterPatternPlain.compile(parsed)); + //parsed.f = pattern; + units.push(FilterPatternGeneric.compile(parsed)); + }, }; /******************************************************************************/ @@ -325,132 +463,385 @@ const FilterTrue = class { return true; } - logData() { - return { - raw: '*', - regex: '^', - compiled: this.compile(), - }; + logData(details) { + details.pattern.push('*'); + details.regex.push('^'); } - compile() { - return [ this.fid ]; + toSelfie() { + return FilterTrue.compile(); } static compile() { return [ FilterTrue.fid ]; } - static load() { + static unitFromCompiled() { + return FilterTrue.filterUnit; + } + + static fromSelfie() { return FilterTrue.instance; } }; FilterTrue.instance = new FilterTrue(); +FilterTrue.filterUnit = filterUnits.push(FilterTrue.instance) - 1; registerFilterClass(FilterTrue); /******************************************************************************/ -const FilterPlain = class { - constructor(s) { - this.s = s; +const FilterPatternPlain = class { + constructor(i, n) { + this.i = i; + this.n = n; } - match(url, tokenBeg) { - return url.startsWith(this.s, tokenBeg); - } - - logData() { - return { - raw: rawToPlainStr(this.s, 0), - regex: rawToRegexStr(this.s, 0), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s, this.tokenBeg ]; - } - - addToTrie(trie) { - if ( this.s.length > 255 ) { return false; } - trie.add(this.s, this.tokenBeg); + match() { + const left = $tokenBeg; + if 
( bidiTrie.startsWith(left, this.i, this.n) === false ) { + return false; + } + $patternMatchLeft = left; + $patternMatchRight = left + this.n; return true; } + get isBidiTrieable() { + return this.n <= 255; + } + + toBidiTrie() { + return { i: this.i, n: this.n, itok: this.tokenBeg }; + } + + logData(details) { + const s = bidiTrie.extractString(this.i, this.n); + details.pattern.push(s); + details.regex.push(restrFromPlainPattern(s)); + } + + toSelfie() { + return [ this.fid, this.i, this.n, this.tokenBeg ]; + } + static compile(details) { - return [ FilterPlain.fid, details.f, details.tokenBeg ]; + return [ FilterPatternPlain.fid, details.f, details.tokenBeg ]; } - static load(args) { + static unitFromCompiled(args) { + const i = bidiTrie.storeString(args[1]); + const n = args[1].length; + let f; if ( args[2] === 0 ) { - return new FilterPlain(args[1]); + f = new FilterPatternPlain(i, n); + } else if ( args[2] === 1 ) { + f = new FilterPatternPlain1(i, n); + } else { + f = new FilterPatternPlainX(i, n, args[2]); } - if ( args[2] === 1 ) { - return new FilterPlain1(args[1]); - } - return new FilterPlainX(args[1], args[2]); + return filterUnits.push(f) - 1; } - static addToTrie(args, trie) { - if ( args[1].length > 255 ) { return false; } - trie.add(args[1], args[2]); + static fromSelfie(args) { + if ( args[3] === 0 ) { + return new FilterPatternPlain(args[1], args[2]); + } + if ( args[3] === 1 ) { + return new FilterPatternPlain1(args[1], args[2]); + } + return new FilterPatternPlainX(args[1], args[2], args[3]); + } +}; + +FilterPatternPlain.prototype.tokenBeg = 0; + +registerFilterClass(FilterPatternPlain); + + +const FilterPatternPlain1 = class extends FilterPatternPlain { + match() { + const left = $tokenBeg - 1; + if ( bidiTrie.startsWith(left, this.i, this.n) === false ) { + return false; + } + $patternMatchLeft = left; + $patternMatchRight = left + this.n; return true; } }; -FilterPlain.trieableId = 0; -FilterPlain.prototype.trieableId = 
FilterPlain.trieableId; -FilterPlain.prototype.tokenBeg = 0; - -registerFilterClass(FilterPlain); +FilterPatternPlain1.prototype.tokenBeg = 1; -const FilterPlain1 = class extends FilterPlain { - match(url, tokenBeg) { - return url.startsWith(this.s, tokenBeg - 1); - } -}; - -FilterPlain1.prototype.tokenBeg = 1; - - -const FilterPlainX = class extends FilterPlain { - constructor(s, tokenBeg) { - super(s); +const FilterPatternPlainX = class extends FilterPatternPlain { + constructor(i, n, tokenBeg) { + super(i, n); this.tokenBeg = tokenBeg; } - match(url, tokenBeg) { - return url.startsWith(this.s, tokenBeg - this.tokenBeg); + match() { + const left = $tokenBeg - this.tokenBeg; + if ( bidiTrie.startsWith(left, this.i, this.n) === false ) { + return false; + } + $patternMatchLeft = left; + $patternMatchRight = left + this.n; + return true; } }; /******************************************************************************/ +const FilterPatternLeft = class { + constructor(i, n) { + this.i = i; + this.n = n; + } + + match() { + const left = bidiTrie.indexOf( + 0, $patternMatchLeft, + this.i, this.n + ); + if ( left === -1 ) { return false; } + $patternMatchLeft = left; + return true; + } + + logData(details) { + const s = bidiTrie.extractString(this.i, this.n); + details.pattern.unshift(s, '*'); + details.regex.unshift(restrFromPlainPattern(s), '.*'); + } + + toSelfie() { + return [ this.fid, this.i, this.n ]; + } + + static compile(details, ex) { + return [ + ex ? 
FilterPatternLeftEx.fid : FilterPatternLeft.fid, + details.f + ]; + } + + static unitFromCompiled(args) { + const i = bidiTrie.storeString(args[1]); + const f = new FilterPatternLeft(i, args[1].length); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterPatternLeft(args[1], args[2]); + } +}; + +registerFilterClass(FilterPatternLeft); + + +const FilterPatternLeftEx = class extends FilterPatternLeft { + match() { + let left = 0; + for (;;) { + left = bidiTrie.indexOf( + left, $patternMatchLeft - 1, + this.i, this.n + ); + if ( left === -1 ) { return false; } + if ( isSeparatorChar(bidiTrie.haystack[left + this.n]) ) { + break; + } + left += 1; + } + $patternMatchLeft = left; + return true; + } + + logData(details) { + const s = bidiTrie.extractString(this.i, this.n); + details.pattern.unshift(s, '^*'); + details.regex.unshift(restrFromPlainPattern(s), restrSeparator, '.*'); + } + + static unitFromCompiled(args) { + const i = bidiTrie.storeString(args[1]); + const f = new FilterPatternLeftEx(i, args[1].length); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterPatternLeftEx(args[1], args[2]); + } +}; + +registerFilterClass(FilterPatternLeftEx); + +/******************************************************************************/ + +const FilterPatternRight = class { + constructor(i, n) { + this.i = i; + this.n = n; + } + + match() { + const right = bidiTrie.lastIndexOf( + $patternMatchRight, bidiTrie.haystackSize, + this.i, this.n + ); + if ( right === -1 ) { return false; } + $patternMatchRight = right + this.n; + return true; + } + + logData(details) { + const s = bidiTrie.extractString(this.i, this.n); + details.pattern.push('*', s); + details.regex.push('.*', restrFromPlainPattern(s)); + } + + toSelfie() { + return [ this.fid, this.i, this.n ]; + } + + static compile(details, ex) { + return [ + ex ? 
FilterPatternRightEx.fid : FilterPatternRight.fid, + details.f + ]; + } + + static unitFromCompiled(args) { + const i = bidiTrie.storeString(args[1]); + const f = new FilterPatternRight(i, args[1].length); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterPatternRight(args[1], args[2]); + } +}; + +registerFilterClass(FilterPatternRight); + + +const FilterPatternRightEx = class extends FilterPatternRight { + match() { + const left = $patternMatchRight; + const right = bidiTrie.lastIndexOf( + left + 1, bidiTrie.haystackSize, + this.i, this.n + ); + if ( right === -1 ) { return false; } + if ( isSeparatorChar(bidiTrie.haystack[left]) === false ) { + return false; + } + $patternMatchRight = right + this.n; + return true; + } + + logData(details) { + const s = bidiTrie.extractString(this.i, this.n); + details.pattern.push('^*', s); + details.regex.push(restrSeparator, '.*', restrFromPlainPattern(s)); + } + + static unitFromCompiled(args) { + const i = bidiTrie.storeString(args[1]); + const f = new FilterPatternRightEx(i, args[1].length); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterPatternRightEx(args[1], args[2]); + } +}; + +registerFilterClass(FilterPatternRightEx); + +/******************************************************************************/ + +const FilterPatternGeneric = class { + constructor(s, anchor) { + this.s = s; + if ( anchor !== 0 ) { + this.anchor = anchor; + } + } + + match() { + if ( this.re === null ) { + this.re = new RegExp(restrFromGenericPattern(this.s, this.anchor)); + } + return this.re.test($requestURL); + } + + logData(details) { + details.pattern.length = 0; + if ( (this.anchor & 0b100) !== 0 ) { + details.pattern.push('||'); + } else if ( (this.anchor & 0b010) !== 0 ) { + details.pattern.push('|'); + } + details.pattern.push(this.s); + if ( (this.anchor & 0b001) !== 0 ) { + details.pattern.push('|'); + } + details.regex.length = 0; + details.regex.push( + 
restrFromGenericPattern(this.s, this.anchor & ~0b100) + ); + } + + toSelfie() { + return [ this.fid, this.s, this.anchor ]; + } + + static compile(details) { + const anchor = details.anchor; + details.anchor = 0; + return [ FilterPatternGeneric.fid, details.f, anchor ]; + } + + static unitFromCompiled(args) { + const f = new FilterPatternGeneric(args[1], args[2]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterPatternGeneric(args[1], args[2]); + } +}; + +FilterPatternGeneric.prototype.re = null; +FilterPatternGeneric.prototype.anchor = 0; + +FilterPatternGeneric.isSlow = true; + +registerFilterClass(FilterPatternGeneric); + +/******************************************************************************/ + const FilterPlainHostname = class { constructor(s) { this.s = s; } match() { - const haystack = $requestHostname; - const needle = this.s; - if ( haystack.endsWith(needle) === false ) { return false; } - const offset = haystack.length - needle.length; - return offset === 0 || haystack.charCodeAt(offset - 1) === 0x2E /* '.' */; + if ( $requestHostname.endsWith(this.s) === false ) { return false; } + const offset = $requestHostname.length - this.s.length; + return offset === 0 || + $requestHostname.charCodeAt(offset - 1) === 0x2E /* '.' 
*/; } - logData() { - return { - raw: `||${this.s}^`, - regex: rawToRegexStr(`${this.s}^`, 0), - compiled: this.compile() - }; + logData(details) { + details.pattern.push('||', this.s, '^'); + details.regex.push(restrFromPlainPattern(this.s), restrSeparator); } - compile() { + toSelfie() { return [ this.fid, this.s ]; } @@ -458,7 +849,12 @@ const FilterPlainHostname = class { return [ FilterPlainHostname.fid, details.f ]; } - static load(args) { + static unitFromCompiled(args) { + const f = new FilterPlainHostname(args[1]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { return new FilterPlainHostname(args[1]); } }; @@ -467,531 +863,264 @@ registerFilterClass(FilterPlainHostname); /******************************************************************************/ -const FilterPlainLeftAnchored = class { - constructor(s) { - this.s = s; +const FilterAnchorHn = class { + constructor() { + this.lastLen = 0; + this.lastBeg = -1; + this.lastEnd = -1; } - match(url) { - return url.startsWith(this.s); - } - - logData() { - return { - raw: `|${this.s}`, - regex: rawToRegexStr(this.s, 0b010), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s ]; - } - - static compile(details) { - return [ FilterPlainLeftAnchored.fid, details.f ]; - } - - static load(args) { - return new FilterPlainLeftAnchored(args[1]); - } -}; - -registerFilterClass(FilterPlainLeftAnchored); - -/******************************************************************************/ - -const FilterPlainRightAnchored = class { - constructor(s) { - this.s = s; - } - - match(url) { - return url.endsWith(this.s); - } - - logData() { - return { - raw: `${this.s}|`, - regex: rawToRegexStr(this.s, 0b001), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s ]; - } - - static compile(details) { - return [ FilterPlainRightAnchored.fid, details.f ]; - } - - static load(args) { - return new FilterPlainRightAnchored(args[1]); - } -}; - 
-registerFilterClass(FilterPlainRightAnchored); - -/******************************************************************************/ - -const FilterExactMatch = class { - constructor(s) { - this.s = s; - } - - match(url) { - return url === this.s; - } - - logData() { - return { - raw: `|${this.s}|`, - regex: rawToRegexStr(this.s, 0b011), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s ]; - } - - static compile(details) { - return [ FilterExactMatch.fid, details.f ]; - } - - static load(args) { - return new FilterExactMatch(args[1]); - } -}; - -registerFilterClass(FilterExactMatch); - -/******************************************************************************/ - -const FilterPlainHnAnchored = class { - constructor(s) { - this.s = s; - } - - match(url, tokenBeg) { - return url.startsWith(this.s, tokenBeg) && - isHnAnchored(url, tokenBeg); - } - - logData() { - return { - raw: `||${this.s}`, - regex: rawToRegexStr(this.s, this.tokenBeg), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s, this.tokenBeg ]; - } - - addToTrie(trie) { - if ( this.s.length > 255 ) { return false; } - trie.add(this.s, this.tokenBeg); - return true; - } - - static compile(details) { - return [ FilterPlainHnAnchored.fid, details.f, details.tokenBeg ]; - } - - static load(args) { - if ( args[2] === 0 ) { - return new FilterPlainHnAnchored(args[1]); - } - return new FilterPlainHnAnchoredX(args[1], args[2]); - } - - static addToTrie(args, trie) { - if ( args[1].length > 255 ) { return false; } - trie.add(args[1], args[2]); - return true; - } -}; - -FilterPlainHnAnchored.trieableId = 1; -FilterPlainHnAnchored.prototype.trieableId = FilterPlainHnAnchored.trieableId; -FilterPlainHnAnchored.prototype.tokenBeg = 0; - -registerFilterClass(FilterPlainHnAnchored); - - -const FilterPlainHnAnchoredX = class extends FilterPlainHnAnchored { - constructor(s, tokenBeg) { - super(s); - this.tokenBeg = tokenBeg; - } - - match(url, tokenBeg) { - 
const beg = tokenBeg - this.tokenBeg; - return url.startsWith(this.s, beg) && isHnAnchored(url, beg); - } -}; - -/******************************************************************************* - - Filters with only one single occurrence of wildcard `*` - -*/ - -const FilterWildcard1 = class { - constructor(s0, s1, tokenBeg) { - this.s0 = s0; - this.s1 = s1; - this.tokenBeg = tokenBeg; - } - - match(url, tokenBeg) { - if ( this.tokenBeg >= 0 ) { - const s0Beg = tokenBeg - this.tokenBeg; - return s0Beg >= 0 && - url.startsWith(this.s0, s0Beg) && - url.indexOf(this.s1, s0Beg + this.s0.length) !== -1; - } - const s1Beg = tokenBeg + this.tokenBeg; - return s1Beg > 0 && - url.startsWith(this.s1, s1Beg) && - url.lastIndexOf(this.s0, s1Beg) !== -1; - } - - logData() { - return { - raw: `${this.s0}*${this.s1}`, - regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s0, this.s1, this.tokenBeg ]; - } - - static compile(details) { - if ( details.token === '*' ) { return; } - if ( details.anchor !== 0 ) { return; } - const s = details.f; - let pos = s.indexOf('*'); - if ( pos === -1 ) { return; } - if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; } - if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; } - return [ - FilterWildcard1.fid, - s.slice(0, pos), - s.slice(pos + 1), - details.tokenBeg < pos - ? 
details.tokenBeg - : pos + 1 - details.tokenBeg, - ]; - } - - static load(args) { - return new FilterWildcard1(args[1], args[2], args[3]); - } -}; - -registerFilterClass(FilterWildcard1); - -/******************************************************************************/ - -const FilterGeneric = class { - constructor(s, anchor) { - this.s = s; - this.anchor = anchor; - } - - match(url) { - if ( this.re === null ) { - this.re = new RegExp(rawToRegexStr(this.s, this.anchor)); - } - return this.re.test(url); - } - - logData() { - const out = { - raw: rawToPlainStr(this.s, this.anchor), - regex: this.re.source, - compiled: this.compile() - }; - if ( this.anchor & 0x2 ) { - out.raw = `|${out.raw}`; - } - if ( this.anchor & 0x1 ) { - out.raw += '|'; - } - return out; - } - - compile() { - return [ this.fid, this.s, this.anchor ]; - } - - static compile(details) { - const compiled = FilterWildcard1.compile(details); - if ( compiled !== undefined ) { return compiled; } - return [ FilterGeneric.fid, details.f, details.anchor ]; - } - - static load(args) { - return new FilterGeneric(args[1], args[2]); - } -}; - -FilterGeneric.prototype.re = null; - -registerFilterClass(FilterGeneric); - -/******************************************************************************* - - Hostname-anchored filters with only one occurrence of wildcard `*` - -*/ - -const FilterWildcard1HnAnchored = class { - constructor(s0, s1, tokenBeg) { - this.s0 = s0; - this.s1 = s1; - this.tokenBeg = tokenBeg; - } - - match(url, tokenBeg) { - if ( this.tokenBeg >= 0 ) { - const s0Beg = tokenBeg - this.tokenBeg; - return s0Beg >= 0 && - url.startsWith(this.s0, s0Beg) && - isHnAnchored(url, s0Beg) && - url.indexOf(this.s1, s0Beg + this.s0.length) !== -1; - } - const s1Beg = tokenBeg + this.tokenBeg; - if ( s1Beg < 0 || url.startsWith(this.s1, s1Beg) === false ) { - return false; - } - const s0Beg = url.lastIndexOf(this.s0, s1Beg); - return s0Beg !== -1 && isHnAnchored(url, s0Beg); - } - - logData() { - 
return { - raw: `||${this.s0}*${this.s1}`, - regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0), - compiled: this.compile() - }; - } - - compile() { - return [ this.fid, this.s0, this.s1, this.tokenBeg ]; - } - - static compile(details) { - if ( details.token === '*' ) { return; } - if ( (details.anchor & 0x0b001) !== 0 ) { return; } - const s = details.f; - let pos = s.indexOf('*'); - if ( pos === -1 ) { return; } - if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; } - const needSeparator = - pos !== 0 && s.charCodeAt(pos - 1) === 0x5E /* '^' */; - if ( needSeparator ) { pos -= 1; } - if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; } - if ( needSeparator ) { - return FilterWildcard2HnAnchored.compile(details, pos); - } - return [ - FilterWildcard1HnAnchored.fid, - s.slice(0, pos), - s.slice(pos + 1), - details.tokenBeg < pos - ? details.tokenBeg - : pos + 1 - details.tokenBeg, - ]; - } - - static load(args) { - return new FilterWildcard1HnAnchored(args[1], args[2], args[3]); - } -}; - -registerFilterClass(FilterWildcard1HnAnchored); - -/******************************************************************************* - - Hostname-anchored filters with one occurrence of the wildcard - sequence `^*` and no other wildcard-equivalent character - -*/ - -const FilterWildcard2HnAnchored = class { - constructor(s0, s1, tokenBeg) { - this.s0 = s0; - this.s1 = s1; - this.tokenBeg = tokenBeg; - } - - match(url, tokenBeg) { - let s0End, s1Beg; - if ( this.tokenBeg >= 0 ) { - const s0Beg = tokenBeg - this.tokenBeg; - if ( s0Beg < 0 || url.startsWith(this.s0, s0Beg) === false ) { - return false; + match() { + const len = $requestHostname.length; + const haystackCodes = bidiTrie.haystack; + if ( + len !== this.lastLen || + this.lastBeg === -1 || + haystackCodes[this.lastBeg-3] !== 0x3A /* ':' */ || + haystackCodes[this.lastBeg-2] !== 0x2F /* '/' */ || + haystackCodes[this.lastBeg-1] !== 0x2F /* '/' */ + ) { + this.lastBeg = len !== 0 ? 
haystackCodes.indexOf(0x3A) : -1; + if ( this.lastBeg !== -1 ) { + if ( + this.lastBeg >= bidiTrie.haystackSize || + haystackCodes[this.lastBeg+1] !== 0x2F || + haystackCodes[this.lastBeg+2] !== 0x2F + ) { + this.lastBeg = -1; + } } - if ( isHnAnchored(url, s0Beg) === false ) { return false; } - s0End = s0Beg + this.s0.length; - s1Beg = url.indexOf(this.s1, s0End); - if ( s1Beg === -1 ) { return false; } - } else { - s1Beg = tokenBeg + this.tokenBeg; - if ( s1Beg < 0 || url.startsWith(this.s1, s1Beg) === false ) { - return false; + if ( this.lastBeg !== -1 ) { + this.lastBeg += 3; + this.lastEnd = this.lastBeg + len; + } else { + this.lastEnd = -1; } - const s0Beg = url.lastIndexOf(this.s0, s1Beg); - if ( s0Beg === -1 || isHnAnchored(url, s0Beg) === false ) { - return false; - } - s0End = s0Beg + this.s0.length; + this.lastLen = len; } - return this.reSeparators.test(url.slice(s0End, s1Beg)); + const left = $patternMatchLeft; + return left < this.lastEnd && ( + left === this.lastBeg || + left > this.lastBeg && haystackCodes[left-1] === 0x2E /* '.' */ + ); } - logData() { - return { - raw: `||${this.s0}^*${this.s1}`, - regex: rawToRegexStr(`${this.s0}^*${this.s1}`, 0), - compiled: this.compile() - }; + logData(details) { + details.pattern.unshift('||'); } - compile() { - return [ this.fid, this.s0, this.s1, this.tokenBeg ]; + toSelfie() { + return [ this.fid ]; } - static compile(details, pos) { - return [ - FilterWildcard2HnAnchored.fid, - details.f.slice(0, pos), - details.f.slice(pos + 2), - details.tokenBeg < pos - ? 
details.tokenBeg - : pos + 2 - details.tokenBeg, - ]; + static compile() { + return [ FilterAnchorHn.fid ]; } - static load(args) { - return new FilterWildcard2HnAnchored(args[1], args[2], args[3]); + static unitFromCompiled() { + return FilterAnchorHn.filterUnit; + } + + static fromSelfie() { + return FilterAnchorHn.instance; } }; -FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/; +FilterAnchorHn.instance = new FilterAnchorHn(); +FilterAnchorHn.filterUnit = filterUnits.length; +filterUnits.push(FilterAnchorHn.instance); -registerFilterClass(FilterWildcard2HnAnchored); +registerFilterClass(FilterAnchorHn); /******************************************************************************/ -const FilterGenericHnAnchored = class { - constructor(s) { - this.s = s; +const FilterAnchorLeft = class { + match() { + return $patternMatchLeft === 0; } - match(url) { - if ( this.re === null ) { - this.re = new RegExp(rawToRegexStr(this.s, this.anchor)); - } - return this.re.test(url); + logData(details) { + details.pattern.unshift('|'); + details.regex.unshift('^'); } - logData() { - return { - raw: `||${this.s}`, - regex: rawToRegexStr(this.s, this.anchor & 0b001), - compiled: this.compile() - }; + toSelfie() { + return [ this.fid ]; } - compile() { - return [ this.fid, this.s ]; + static compile() { + return [ FilterAnchorLeft.fid ]; } - static compile(details) { - const compiled = FilterWildcard1HnAnchored.compile(details); - if ( compiled !== undefined ) { return compiled; } - return [ FilterGenericHnAnchored.fid, details.f ]; + static unitFromCompiled() { + return FilterAnchorLeft.filterUnit; } - static load(args) { - return new FilterGenericHnAnchored(args[1]); + static fromSelfie() { + return FilterAnchorLeft.instance; } }; -FilterGenericHnAnchored.prototype.re = null; -FilterGenericHnAnchored.prototype.anchor = 0x4; +FilterAnchorLeft.instance = new FilterAnchorLeft(); +FilterAnchorLeft.filterUnit = filterUnits.length; 
+filterUnits.push(FilterAnchorLeft.instance); -registerFilterClass(FilterGenericHnAnchored); +registerFilterClass(FilterAnchorLeft); /******************************************************************************/ -const FilterGenericHnAndRightAnchored = class extends FilterGenericHnAnchored { - logData() { - const out = super.logData(); - out.raw += '|'; - return out; +const FilterAnchorRight = class { + match() { + return $patternMatchRight === $requestURL.length; } - static compile(details) { - return [ FilterGenericHnAndRightAnchored.fid, details.f ]; + logData(details) { + details.pattern.push('|'); + details.regex.push('$'); } - static load(args) { - return new FilterGenericHnAndRightAnchored(args[1]); + toSelfie() { + return [ this.fid ]; + } + + static compile() { + return [ FilterAnchorRight.fid ]; + } + + static unitFromCompiled() { + return FilterAnchorRight.filterUnit; + } + + static fromSelfie() { + return FilterAnchorRight.instance; } }; -FilterGenericHnAndRightAnchored.prototype.anchor = 0x5; +FilterAnchorRight.instance = new FilterAnchorRight(); +FilterAnchorRight.filterUnit = filterUnits.length; +filterUnits.push(FilterAnchorRight.instance); -registerFilterClass(FilterGenericHnAndRightAnchored); +registerFilterClass(FilterAnchorRight); + +/******************************************************************************/ + +const FilterTrailingSeparator = class { + match() { + return $patternMatchRight === $requestURL.length || + isSeparatorChar(bidiTrie.haystack[$patternMatchRight]); + } + + logData(details) { + details.pattern.push('^'); + details.regex.push(restrSeparator); + } + + toSelfie() { + return [ this.fid ]; + } + + static compile() { + return [ FilterTrailingSeparator.fid ]; + } + + static unitFromCompiled() { + return FilterTrailingSeparator.filterUnit; + } + + static fromSelfie() { + return FilterTrailingSeparator.instance; + } +}; + +FilterTrailingSeparator.instance = new FilterTrailingSeparator(); +FilterTrailingSeparator.filterUnit = 
filterUnits.length; +filterUnits.push(FilterTrailingSeparator.instance); + +registerFilterClass(FilterTrailingSeparator); + +/******************************************************************************/ + +const FilterType = class { + constructor(bits) { + this.typeBits = bits; + } + + match() { + return (this.typeBits & $requestTypeBit) !== 0; + } + + logData() { + } + + toSelfie() { + return [ this.fid, this.typeBits ]; + } + + static compile(details) { + return [ FilterType.fid, details.typeBits & allNetworkTypesBits ]; + } + + static unitFromCompiled(args) { + const f = new FilterType(args[1]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterType(args[1]); + } +}; + +registerFilterClass(FilterType); /******************************************************************************/ const FilterRegex = class { constructor(s) { - this.re = s; + this.s = s; } - match(url) { - if ( typeof this.re === 'string' ) { - this.re = new RegExp(this.re, 'i'); + match() { + if ( this.re === null ) { + this.re = FilterRegex.dict.get(this.s); + if ( this.re === undefined ) { + this.re = new RegExp(this.s, 'i'); + FilterRegex.dict.set(this.s, this.re); + } } - return this.re.test(url); + if ( this.re.test($requestURL) === false ) { return false; } + $patternMatchLeft = $requestURL.search(this.re); + return true; } - logData() { - const s = typeof this.re === 'string' ? this.re : this.re.source; - return { - raw: `/${s}/`, - regex: s, - compiled: this.compile() - }; + logData(details) { + details.pattern.push('/', this.s, '/'); + details.regex.push(this.s); + details.isRegex = true; } - compile() { - return [ - this.fid, - typeof this.re === 'string' ? 
this.re : this.re.source - ]; + toSelfie() { + return [ this.fid, this.s ]; } static compile(details) { return [ FilterRegex.fid, details.f ]; } - static load(args) { + static unitFromCompiled(args) { + const f = new FilterRegex(args[1]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { return new FilterRegex(args[1]); } }; +FilterRegex.prototype.re = null; + +FilterRegex.isSlow = true; +FilterRegex.dict = new Map(); + registerFilterClass(FilterRegex); /******************************************************************************/ @@ -1008,69 +1137,83 @@ const filterOrigin = new (class { ); } catch(ex) { } - this.trieContainer = new µBlock.HNTrieContainer(trieDetails); - this.strSlots = []; - this.strToSlotId = new Map(); + this.trieContainer = new µb.HNTrieContainer(trieDetails); + this.strToUnitMap = new Map(); this.gcTimer = undefined; } - compile(details, wrapped) { + compile(details, prepend, units) { const domainOpt = details.domainOpt; + let compiledMiss, compiledHit; // One hostname if ( domainOpt.indexOf('|') === -1 ) { + // Must be a miss if ( domainOpt.charCodeAt(0) === 0x7E /* '~' */ ) { - return FilterOriginMiss.compile(domainOpt, wrapped); + compiledMiss = FilterOriginMiss.compile(domainOpt); + } + // Must be a hit + else { + compiledHit = FilterOriginHit.compile(domainOpt); } - return FilterOriginHit.compile(domainOpt, wrapped); } // Many hostnames. // Must be in set (none negated). - if ( domainOpt.indexOf('~') === -1 ) { - return FilterOriginHitSet.compile(domainOpt, wrapped); + else if ( domainOpt.indexOf('~') === -1 ) { + compiledHit = FilterOriginHitSet.compile(domainOpt); } // Must not be in set (all negated). - const reAllNegated = /^~(?:[^|~]+\|~)+[^|~]+$/; - if ( reAllNegated.test(domainOpt) ) { - return FilterOriginMissSet.compile(domainOpt, wrapped); + else if ( /^~(?:[^|~]+\|~)+[^|~]+$/.test(domainOpt) ) { + compiledMiss = FilterOriginMissSet.compile(domainOpt); } // Must be in one set, but not in the other. 
- return FilterOriginMixedSet.compile(domainOpt, wrapped); + else { + const hostnames = domainOpt.split('|'); + const missSet = hostnames.filter(hn => { + if ( hn.charCodeAt(0) === 0x7E /* '~' */ ) { + return hn; + } + }); + const hitSet = hostnames.filter(hn => { + if ( hn.charCodeAt(0) !== 0x7E /* '~' */ ) { + return hn; + } + }); + compiledMiss = missSet.length === 1 + ? FilterOriginMiss.compile(missSet[0]) + : FilterOriginMissSet.compile(missSet.join('|')); + compiledHit = hitSet.length === 1 + ? FilterOriginHit.compile(hitSet[0]) + : FilterOriginHitSet.compile(hitSet.join('|')); + } + if ( prepend ) { + if ( compiledHit ) { units.unshift(compiledHit); } + if ( compiledMiss ) { units.unshift(compiledMiss); } + } else { + if ( compiledMiss ) { units.push(compiledMiss); } + if ( compiledHit ) { units.push(compiledHit); } + } } - slotIdFromStr(s) { - let slotId = this.strToSlotId.get(s); - if ( slotId !== undefined ) { return slotId; } - slotId = this.strSlots.push(s) - 1; - this.strToSlotId.set(s, slotId); - if ( this.gcTimer !== undefined ) { return slotId; } - this.gcTimer = self.requestIdleCallback( + unitFromCompiled(ctor, s) { + let iunit = this.strToUnitMap.get(s); + if ( iunit !== undefined ) { return iunit; } + const f = new ctor(s); + iunit = filterUnits.push(f) - 1; + this.strToUnitMap.set(s, iunit); + if ( this.gcTimer !== undefined ) { return iunit; } + this.gcTimer = self.setTimeout( ( ) => { this.gcTimer = undefined; - this.strToSlotId.clear(); + this.strToUnitMap.clear(); }, - { timeout: 5000 } + 5000 ); - return slotId; - } - - strFromSlotId(slotId) { - return this.strSlots[slotId]; - } - - logData(out, domainOpt) { - if ( out.opts !== undefined ) { out.opts += ','; } - out.opts = `domain=${domainOpt}`; - return out; - } - - readyToUse() { - return this.trieContainer.readyToUse(); + return iunit; } reset() { this.trieContainer.reset(); - this.strSlots.length = 0; - this.strToSlotId.clear(); + this.strToUnitMap.clear(); } optimize() { @@ -1079,60 
+1222,52 @@ const filterOrigin = new (class { 'FilterOrigin.trieDetails', JSON.stringify(trieDetails) ); - this.strToSlotId.clear(); + } + + toSelfie() { + } + + fromSelfie() { } })(); /******************************************************************************/ -// Surprinsingly, first peeking and comparing only the first character using -// charCodeAt() does help a bit performance -- 3-6µs gain per request on -// average for Chromium 71 and Firefox 65 with default lists. -// A likely explanation is that most visits are a miss, and in such case -// calling charCodeAt() to bail out earlier is cheaper than calling endsWith(). - const FilterOriginHit = class { - constructor(hostname, wrapped) { + constructor(hostname) { this.hostname = hostname; - this.wrapped = wrapped; } - match(url, tokenBeg) { + match() { const haystack = $docHostname; - const offset = haystack.length - this.hostname.length; + const needle = this.hostname; + const offset = haystack.length - needle.length; if ( offset < 0 ) { return false; } - if ( haystack.charCodeAt(offset) !== this.hostname.charCodeAt(0) ) { + if ( haystack.charCodeAt(offset) !== needle.charCodeAt(0) ) { return false; } - if ( haystack.endsWith(this.hostname) === false ) { return false; } - if ( - offset !== 0 && - haystack.charCodeAt(offset-1) !== 0x2E /* '.' */ - ) { - return false; - } - return this.wrapped.match(url, tokenBeg); + if ( haystack.endsWith(needle) === false ) { return false; } + return offset === 0 || haystack.charCodeAt(offset-1) === 0x2E /* '.' 
*/; } - logData() { - const out = this.wrapped.logData(); - out.compiled = [ this.fid, this.hostname, out.compiled ]; - return filterOrigin.logData(out, this.hostname); + toSelfie() { + return [ this.fid, this.hostname ]; } - compile(toSelfie = false) { - return [ this.fid, this.hostname, this.wrapped.compile(toSelfie) ]; + logData(details) { + details.domains.push(this.hostname); } - static compile(domainOpt, wrapped) { - return [ FilterOriginHit.fid, domainOpt, wrapped ]; + static compile(domainOpt) { + return [ FilterOriginHit.fid, domainOpt ]; } - static load(args) { - return new FilterOriginHit( - args[1], - filterFromCompiledData(args[2]) - ); + static unitFromCompiled(args) { + return filterOrigin.unitFromCompiled(FilterOriginHit, args[1]); + } + + static fromSelfie(args) { + return new FilterOriginHit(args[1]); } }; @@ -1141,12 +1276,11 @@ registerFilterClass(FilterOriginHit); /******************************************************************************/ const FilterOriginMiss = class { - constructor(hostname, wrapped) { - this.hostname = hostname; - this.wrapped = wrapped; + constructor(hostname) { + this.hostname = hostname.slice(1); } - match(url, tokenBeg) { + match() { const haystack = $docHostname; if ( haystack.endsWith(this.hostname) ) { const offset = haystack.length - this.hostname.length; @@ -1157,28 +1291,27 @@ const FilterOriginMiss = class { return false; } } - return this.wrapped.match(url, tokenBeg); + return true; } - logData() { - const out = this.wrapped.logData(); - out.compiled = [ this.fid, this.hostname, out.compiled ]; - return filterOrigin.logData(out, `~${this.hostname}`); + logData(details) { + details.domains.push(`~${this.hostname}`); } - compile(toSelfie = false) { - return [ this.fid, this.hostname, this.wrapped.compile(toSelfie) ]; + toSelfie() { + return [ this.fid, `~${this.hostname}` ]; } - static compile(domainOpt, wrapped) { - return [ FilterOriginMiss.fid, domainOpt.slice(1), wrapped ]; + static compile(domainOpt) { + 
return [ FilterOriginMiss.fid, domainOpt ]; } - static load(args) { - return new FilterOriginMiss( - args[1], - filterFromCompiledData(args[2]) - ); + static unitFromCompiled(args) { + return filterOrigin.unitFromCompiled(FilterOriginMiss, args[1]); + } + + static fromSelfie(args) { + return new FilterOriginMiss(args[1]); } }; @@ -1187,53 +1320,46 @@ registerFilterClass(FilterOriginMiss); /******************************************************************************/ const FilterOriginHitSet = class { - constructor(domainOpt, wrapped, oneOf = null) { - this.domainOpt = typeof domainOpt === 'number' - ? domainOpt - : filterOrigin.slotIdFromStr(domainOpt); - this.wrapped = filterFromCompiledData(wrapped); + constructor(domainOpt, oneOf = null) { + this.domainOpt = domainOpt; this.oneOf = oneOf !== null ? filterOrigin.trieContainer.createOne(oneOf) : null; } - match(url, tokenBeg) { + match() { if ( this.oneOf === null ) { this.oneOf = filterOrigin.trieContainer.fromIterable( - filterOrigin.strFromSlotId(this.domainOpt).split('|') + this.domainOpt.split('|') ); } - return this.oneOf.matches($docHostname) !== -1 && - this.wrapped.match(url, tokenBeg); + return this.oneOf.matches($docHostname) !== -1; } - logData() { - const out = this.wrapped.logData(); - const domainOpt = filterOrigin.strFromSlotId(this.domainOpt); - out.compiled = [ this.fid, domainOpt, out.compiled ]; - return filterOrigin.logData(out, domainOpt); + logData(details) { + details.domains.push(this.domainOpt); } - compile(toSelfie = false) { - const out = [ + toSelfie() { + return [ this.fid, - toSelfie - ? this.domainOpt : - filterOrigin.strFromSlotId(this.domainOpt), - this.wrapped.compile(toSelfie), + this.domainOpt, + this.oneOf !== null + ? 
filterOrigin.trieContainer.compileOne(this.oneOf) + : null ]; - if ( this.oneOf !== null ) { - out.push(filterOrigin.trieContainer.compileOne(this.oneOf)); - } - return out; } - static compile(domainOpt, wrapped) { - return [ FilterOriginHitSet.fid, domainOpt, wrapped ]; + static compile(domainOpt) { + return [ FilterOriginHitSet.fid, domainOpt ]; } - static load(args) { - return new FilterOriginHitSet(...args.slice(1)); + static unitFromCompiled(args) { + return filterOrigin.unitFromCompiled(FilterOriginHitSet, args[1]); + } + + static fromSelfie(args) { + return new FilterOriginHitSet(args[1], args[2]); } }; @@ -1242,56 +1368,46 @@ registerFilterClass(FilterOriginHitSet); /******************************************************************************/ const FilterOriginMissSet = class { - constructor(domainOpt, wrapped, noneOf = null) { - this.domainOpt = typeof domainOpt === 'number' - ? domainOpt - : filterOrigin.slotIdFromStr(domainOpt); - this.wrapped = filterFromCompiledData(wrapped); + constructor(domainOpt, noneOf = null) { + this.domainOpt = domainOpt; this.noneOf = noneOf !== null ? filterOrigin.trieContainer.createOne(noneOf) : null; } - match(url, tokenBeg) { + match() { if ( this.noneOf === null ) { this.noneOf = filterOrigin.trieContainer.fromIterable( - filterOrigin - .strFromSlotId(this.domainOpt) - .replace(/~/g, '') - .split('|') + this.domainOpt.replace(/~/g, '').split('|') ); } - return this.noneOf.matches($docHostname) === -1 && - this.wrapped.match(url, tokenBeg); + return this.noneOf.matches($docHostname) === -1; } - logData() { - const out = this.wrapped.logData(); - const domainOpt = filterOrigin.strFromSlotId(this.domainOpt); - out.compiled = [ this.fid, domainOpt, out.compiled ]; - return filterOrigin.logData(out, domainOpt); + logData(details) { + details.domains.push(this.domainOpt); } - compile(toSelfie = false) { - const out = [ + toSelfie() { + return [ this.fid, - toSelfie - ? 
this.domainOpt - : filterOrigin.strFromSlotId(this.domainOpt), - this.wrapped.compile(toSelfie), + this.domainOpt, + this.noneOf !== null + ? filterOrigin.trieContainer.compileOne(this.noneOf) + : null ]; - if ( this.noneOf !== null ) { - out.push(filterOrigin.trieContainer.compileOne(this.noneOf)); - } - return out; } - static compile(domainOpt, wrapped) { - return [ FilterOriginMissSet.fid, domainOpt, wrapped ]; + static compile(domainOpt) { + return [ FilterOriginMissSet.fid, domainOpt ]; } - static load(args) { - return new FilterOriginMissSet(...args.slice(1)); + static unitFromCompiled(args) { + return filterOrigin.unitFromCompiled(FilterOriginMissSet, args[1]); + } + + static fromSelfie(args) { + return new FilterOriginMissSet(args[1], args[2]); } }; @@ -1299,128 +1415,53 @@ registerFilterClass(FilterOriginMissSet); /******************************************************************************/ -const FilterOriginMixedSet = class { - constructor(domainOpt, wrapped, oneOf = null, noneOf = null) { - this.domainOpt = typeof domainOpt === 'number' - ? domainOpt - : filterOrigin.slotIdFromStr(domainOpt); - this.wrapped = filterFromCompiledData(wrapped); - this.oneOf = oneOf !== null - ? filterOrigin.trieContainer.createOne(oneOf) - : null; - this.noneOf = noneOf !== null - ? 
filterOrigin.trieContainer.createOne(noneOf) - : null; - } - - init() { - const oneOf = [], noneOf = []; - const domainOpt = filterOrigin.strFromSlotId(this.domainOpt); - for ( const hostname of domainOpt.split('|') ) { - if ( hostname.charCodeAt(0) === 0x7E /* '~' */ ) { - noneOf.push(hostname.slice(1)); - } else { - oneOf.push(hostname); - } - } - this.oneOf = filterOrigin.trieContainer.fromIterable(oneOf); - this.noneOf = filterOrigin.trieContainer.fromIterable(noneOf); - } - - match(url, tokenBeg) { - if ( this.oneOf === null ) { this.init(); } - let needle = $docHostname; - return this.oneOf.matches(needle) !== -1 && - this.noneOf.matches(needle) === -1 && - this.wrapped.match(url, tokenBeg); - } - - logData() { - const out = this.wrapped.logData(); - const domainOpt = filterOrigin.strFromSlotId(this.domainOpt); - out.compiled = [ this.fid, domainOpt, out.compiled ]; - return filterOrigin.logData(out, domainOpt); - } - - compile(toSelfie = false) { - const out = [ - this.fid, - toSelfie - ? 
this.domainOpt - : filterOrigin.strFromSlotId(this.domainOpt), - this.wrapped.compile(toSelfie), - ]; - if ( this.oneOf !== null ) { - out.push( - filterOrigin.trieContainer.compileOne(this.oneOf), - filterOrigin.trieContainer.compileOne(this.noneOf) - ); - } - return out; - } - - static compile(domainOpt, wrapped) { - return [ FilterOriginMixedSet.fid, domainOpt, wrapped ]; - } - - static load(args) { - return new FilterOriginMixedSet(...args.slice(1)); - } -}; - -registerFilterClass(FilterOriginMixedSet); - -/******************************************************************************/ - const FilterDataHolder = class { constructor(dataType, data) { this.dataType = dataType; this.data = data; - this.wrapped = undefined; } - match(url, tokenBeg) { - return this.wrapped.match(url, tokenBeg); + match() { + return true; } - matchAndFetchData(type, url, tokenBeg, out) { - if ( this.dataType === type && this.match(url, tokenBeg) ) { + matchAndFetchData(type, out) { + if ( this.dataType !== type ) { return false; } + if ( Array.isArray(out) ) { out.push(this); } + return true; + } + + getData(type) { + if ( type === this.dataType ) { + return this.data; + } } - logData() { - const out = this.wrapped.logData(); - out.compiled = [ this.fid, this.dataType, this.data, out.compiled ]; + logData(details) { let opt = this.dataType; if ( this.data !== '' ) { opt += `=${this.data}`; } - if ( out.opts === undefined ) { - out.opts = opt; - } else { - out.opts = opt + ',' + out.opts; - } - return out; + details.options.push(opt); } - compile(toSelfie = false) { - return [ - this.fid, - this.dataType, - this.data, - this.wrapped.compile(toSelfie) - ]; + toSelfie() { + return [ this.fid, this.dataType, this.data ]; } static compile(details) { return [ FilterDataHolder.fid, details.dataType, details.data ]; } - static load(args) { + static unitFromCompiled(args) { const f = new FilterDataHolder(args[1], args[2]); - f.wrapped = filterFromCompiledData(args[3]); - return f; + return 
filterUnits.push(f) - 1; + } + + static fromSelfie(args) { + return new FilterDataHolder(args[1], args[2]); } }; @@ -1430,14 +1471,14 @@ registerFilterClass(FilterDataHolder); // be a match. const FilterDataHolderResult = class { - constructor(bits, th, f) { + constructor(bits, th, iunit) { this.bits = bits; this.th = th; - this.f = f; + this.iunit = iunit; } - get data() { - return this.f.data; + getData(type) { + return filterUnits[this.iunit].getData(type); } get result() { @@ -1445,7 +1486,7 @@ const FilterDataHolderResult = class { } logData() { - const r = toLogDataInternal(this.bits, this.th, this.f); + const r = toLogDataInternal(this.bits, this.th, this.iunit); r.source = 'static'; r.result = this.result; return r; @@ -1454,11 +1495,147 @@ const FilterDataHolderResult = class { /******************************************************************************/ +const FilterCollection = class { + constructor(i = 0, n = 0) { + this.i = i; + this.n = n; + } + + get size() { + return this.n; + } + + unshift(iunit) { + const j = this.i; + this.i = filterSequenceAdd(iunit, j); + this.n += 1; + } + + shift() { + const sequences = filterSequences; + filterUnits[sequences[this.i+0]] = null; + this.i = sequences[this.i+1]; + this.n -= 1; + } + + forEach(fn) { + let i = this.i; + if ( i === 0 ) { return; } + const sequences = filterSequences; + do { + const iunit = sequences[i+0]; + const r = fn(iunit); + if ( r !== undefined ) { return r; } + i = sequences[i+1]; + } while ( i !== 0 ); + } + + toSelfie() { + return [ this.fid, this.i, this.n ]; + } + + static compile(ctor, fdata) { + return [ ctor.fid, fdata ]; + } + + static unitFromCompiled(ctor, args) { + let iprev = 0, i0 = 0; + const n = args[1].length; + for ( let i = 0; i < n; i++ ) { + const iunit = filterUnitFromCompiled(args[1][i]); + const inext = filterSequenceAdd(iunit, 0); + if ( iprev !== 0 ) { + filterSequences[iprev+1] = inext; + } else { + i0 = inext; + } + iprev = inext; + } + return 
filterUnits.push(new ctor(i0, args[1].length)) - 1; + } + + static fromSelfie(ctor, args) { + return new ctor(args[1], args[2]); + } +}; + +/******************************************************************************/ + +const FilterComposite = class extends FilterCollection { + match() { + let i = this.i; + if ( i === 0 ) { return false; } + const sequences = filterSequences; + const units = filterUnits; + do { + if ( units[sequences[i+0]].match() !== true ) { return false; } + i = sequences[i+1]; + } while ( i !== 0 ); + return true; + } + + matchAndFetchData(type, out) { + if ( this.match() !== true ) { return false; } + this.forEach(iunit => { + const f = filterUnits[iunit]; + if ( f.matchAndFetchData instanceof Function === false ) { return; } + if ( f.matchAndFetchData(type) === false ) { return; } + if ( Array.isArray(out) ) { + out.push(this); + } + return true; + }); + } + + getData(type) { + return this.forEach(iunit => { + const f = filterUnits[iunit]; + if ( f.matchAndFetchData instanceof Function ) { + return f.getData(type); + } + }); + } + + // FilterPatternPlain is assumed to be first filter in sequence. This can + // be revisited if needed. 
+ get isBidiTrieable() { + return filterUnits[filterSequences[this.i]].isBidiTrieable === true; + } + + toBidiTrie() { + const details = filterUnits[filterSequences[this.i]].toBidiTrie(); + this.shift(); + return details; + } + + logData(details) { + this.forEach(iunit => { + filterUnits[iunit].logData(details); + }); + } + + static compile(fdata) { + return FilterCollection.compile(FilterComposite, fdata); + } + + static unitFromCompiled(args) { + return FilterCollection.unitFromCompiled(FilterComposite, args); + } + + static fromSelfie(args) { + return FilterCollection.fromSelfie(FilterComposite, args); + } +}; + +registerFilterClass(FilterComposite); + +/******************************************************************************/ + // Dictionary of hostnames const FilterHostnameDict = class { constructor(args) { - this.h = ''; // short-lived register + this.$h = ''; // short-lived register this.dict = FilterHostnameDict.trieContainer.createOne(args); } @@ -1473,24 +1650,20 @@ const FilterHostnameDict = class { match() { const pos = this.dict.matches($requestHostname); if ( pos === -1 ) { return false; } - this.h = $requestHostname.slice(pos); + this.$h = $requestHostname.slice(pos); return true; } - logData() { - return { - raw: `||${this.h}^`, - regex: `${rawToRegexStr(this.h, 0)}(?:[^%.0-9a-z_-]|$)`, - compiled: this.h - }; + logData(details) { + details.pattern.push('||', this.$h, '^'); + details.regex.push(restrFromPlainPattern(this.$h), restrSeparator); } - compile() { - return [ this.fid, FilterHostnameDict.trieContainer.compileOne(this.dict) ]; - } - - static readyToUse() { - return FilterHostnameDict.trieContainer.readyToUse(); + toSelfie() { + return [ + this.fid, + FilterHostnameDict.trieContainer.compileOne(this.dict) + ]; } static reset() { @@ -1505,7 +1678,7 @@ const FilterHostnameDict = class { ); } - static load(args) { + static fromSelfie(args) { return new FilterHostnameDict(args[1]); } }; @@ -1518,7 +1691,7 @@ 
FilterHostnameDict.trieContainer = (( ) => { ); } catch(ex) { } - return new µBlock.HNTrieContainer(trieDetails); + return new µb.HNTrieContainer(trieDetails); })(); registerFilterClass(FilterHostnameDict); @@ -1530,7 +1703,7 @@ registerFilterClass(FilterHostnameDict); const FilterJustOrigin = class { constructor(args) { - this.h = ''; // short-lived register + this.$h = ''; // short-lived register this.dict = filterOrigin.trieContainer.createOne(args); } @@ -1545,24 +1718,26 @@ const FilterJustOrigin = class { match() { const pos = this.dict.matches($docHostname); if ( pos === -1 ) { return false; } - this.h = $docHostname.slice(pos); + this.$h = $docHostname.slice(pos); return true; } - logData() { - return { - raw: '*', - regex: '^', - compiled: this.h, - opts: `domain=${this.h}`, - }; + logData(details) { + details.pattern.push('*'); + details.regex.push('^'); + details.domains.push(this.$h); } - compile() { + toSelfie() { return [ this.fid, filterOrigin.trieContainer.compileOne(this.dict) ]; } - static load(args) { + static unitFromCompiled(args) { + const f = new FilterJustOrigin(args[1]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { return new FilterJustOrigin(args[1]); } }; @@ -1572,18 +1747,22 @@ registerFilterClass(FilterJustOrigin); /******************************************************************************/ const FilterHTTPSJustOrigin = class extends FilterJustOrigin { - match(url) { - return url.startsWith('https://') && super.match(); + match() { + return $requestURL.startsWith('https://') && super.match(); } - logData() { - const out = super.logData(); - out.raw = '|https://'; - out.regex = '^https://'; - return out; + logData(details) { + details.pattern.push('|https://'); + details.regex.push('^https://'); + details.domains.push(this.$h); } - static load(args) { + static unitFromCompiled(args) { + const f = new FilterHTTPSJustOrigin(args[1]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { return new 
FilterHTTPSJustOrigin(args[1]); } }; @@ -1593,18 +1772,22 @@ registerFilterClass(FilterHTTPSJustOrigin); /******************************************************************************/ const FilterHTTPJustOrigin = class extends FilterJustOrigin { - match(url) { - return url.startsWith('http://') && super.match(); + match() { + return $requestURL.startsWith('http://') && super.match(); } - logData() { - const out = super.logData(); - out.raw = '|https://'; - out.regex = '^https://'; - return out; + logData(details) { + details.pattern.push('|http://'); + details.regex.push('^http://'); + details.domains.push(this.$h); } - static load(args) { + static unitFromCompiled(args) { + const f = new FilterHTTPJustOrigin(args[1]); + return filterUnits.push(f) - 1; + } + + static fromSelfie(args) { return new FilterHTTPJustOrigin(args[1]); } }; @@ -1613,790 +1796,795 @@ registerFilterClass(FilterHTTPJustOrigin); /******************************************************************************/ -const FilterPair = class { - constructor(a, b) { - this.f1 = a; - this.f2 = b; - } - - get size() { - return 2; - } - - match(url, tokenBeg) { - if ( this.f1.match(url, tokenBeg) === true ) { - this.f = this.f1; - return true; - } - if ( this.f2.match(url, tokenBeg) === true ) { - this.f = this.f2; - return true; +const FilterBucket = class extends FilterCollection { + match() { + if ( this.plainTrie !== null ) { + if ( this.plainTrie.matches($tokenBeg) ) { + this.$matchedTrie = true; + this.$matchedUnit = this.plainTrie.$iu; + return true; + } } + let i = this.i; + if ( i === 0 ) { return false; } + const sequences = filterSequences; + const units = filterUnits; + do { + if ( units[sequences[i+0]].match() ) { + this.$matchedTrie = false; + this.$matchedUnit = sequences[i+0]; + return true; + } + i = sequences[i+1]; + } while ( i !== 0 ); return false; } - matchAndFetchData(type, url, tokenBeg, out) { - this.f1.matchAndFetchData(type, url, tokenBeg, out); - 
this.f2.matchAndFetchData(type, url, tokenBeg, out); + matchAndFetchData(type, out) { + const units = filterUnits; + this.forEach(iunit => { + units[iunit].matchAndFetchData(type, out); + }); } - logData() { - return this.f.logData(); - } - - compile(toSelfie = false) { - return [ - this.fid, - this.f1.compile(toSelfie), - this.f2.compile(toSelfie) - ]; - } - - upgrade(a) { - const bucket = new FilterBucket(this.f1, this.f2, a); - this.f1 = this.f2 = undefined; - this.f = null; - FilterPair.available = this; - return bucket; - } - - static load(args) { - const f1 = filterFromCompiledData(args[1]); - const f2 = filterFromCompiledData(args[2]); - const pair = FilterPair.available; - if ( pair === null ) { - return new FilterPair(f1, f2); + logData(details) { + if ( this.$matchedTrie ) { + const s = $requestURL.slice(this.plainTrie.$l, this.plainTrie.$r); + details.pattern.push(s); + details.regex.push(restrFromPlainPattern(s)); } - FilterPair.available = null; - pair.f1 = f1; - pair.f2 = f2; - return pair; - } -}; - -FilterPair.prototype.f = null; - -FilterPair.available = null; - -registerFilterClass(FilterPair); - -/******************************************************************************/ - -const FilterBucket = class { - constructor(a, b, c) { - this.filters = []; - if ( a !== undefined ) { - this.filters.push(a, b, c); - this._countTrieable(); + if ( this.$matchedUnit !== -1 ) { + filterUnits[this.$matchedUnit].logData(details); } - this.trieResult = 0; } - get size() { - let size = this.filters.length; + toSelfie() { + const selfie = super.toSelfie(); if ( this.plainTrie !== null ) { - size += this.plainTrie.size; + selfie.push(bidiTrie.compileOne(this.plainTrie)); } - if ( this.plainHnAnchoredTrie !== null ) { - size += this.plainHnAnchoredTrie.size; - } - return size; + return selfie; } - add(fdata) { - const fclass = filterClasses[fdata[0]]; - if ( fclass.trieableId === 0 ) { - if ( this.plainTrie !== null ) { - if ( fclass.addToTrie(fdata, 
this.plainTrie) ) { return; } - } else if ( this.plainCount < 3 ) { - this.plainCount += 1; + optimize() { + const units = filterUnits; + const trieables = new Set(); + let i = this.i; + for (;;) { + const f = units[filterSequences[i+0]]; + if ( f.isBidiTrieable === true ) { + trieables.add(i); + } + i = filterSequences[i+1]; + if ( i === 0 ) { break; } + } + if ( trieables.size <= 2 ) { return; } + if ( this.plainTrie === null ) { + this.plainTrie = bidiTrie.createOne(); + } + i = this.i; + let iprev = 0; + for (;;) { + const iunit = filterSequences[i+0]; + const inext = filterSequences[i+1]; + if ( trieables.has(i) ) { + this._addToTrie(iunit); + if ( iprev !== 0 ) { + filterSequences[iprev+1] = inext; + } else { + this.i = inext; + } } else { - this.plainTrie = FilterBucket.trieContainer.createOne(); - this._transferTrieable(0, this.plainTrie); - if ( fclass.addToTrie(fdata, this.plainTrie) ) { return; } + iprev = i; } - } else if ( fclass.trieableId === 1 ) { - if ( this.plainHnAnchoredTrie !== null ) { - if ( fclass.addToTrie(fdata, this.plainHnAnchoredTrie) ) { return; } - } else if ( this.plainHnAnchoredCount < 3 ) { - this.plainHnAnchoredCount += 1; - } else { - this.plainHnAnchoredTrie = FilterBucket.trieContainer.createOne(); - this._transferTrieable(1, this.plainHnAnchoredTrie); - if ( fclass.addToTrie(fdata, this.plainHnAnchoredTrie) ) { return; } - } - } - this.filters.push(filterFromCompiledData(fdata)); - } - - match(url, tokenBeg) { - if ( this.plainTrie !== null ) { - const pos = this.plainTrie.matches(url, tokenBeg); - if ( pos !== -1 ) { - this.trieResult = pos; - this.f = this.plainFilter; - this.f.tokenBeg = tokenBeg - (pos >>> 16); - return true; - } - } - if ( this.plainHnAnchoredTrie !== null ) { - const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg); - if ( pos !== -1 && isHnAnchored(url, pos >>> 16) ) { - this.trieResult = pos; - this.f = this.plainHnAnchoredFilter; - this.f.tokenBeg = tokenBeg - (pos >>> 16); - return true; - } - } 
- const filters = this.filters; - for ( let i = 0, n = filters.length; i < n; i++ ) { - if ( filters[i].match(url, tokenBeg) === true ) { - this.f = filters[i]; - if ( i >= 16 ) { this._promote(i); } - return true; - } - } - return false; - } - - matchAndFetchData(type, url, tokenBeg, out) { - for ( const f of this.filters ) { - f.matchAndFetchData(type, url, tokenBeg, out); + if ( inext === 0 ) { break; } + i = inext; } } - logData() { - if ( - this.f === this.plainFilter || - this.f === this.plainHnAnchoredFilter - ) { - this.f.s = $requestURL.slice( - this.trieResult >>> 16, - this.trieResult & 0xFFFF - ); - } - return this.f.logData(); - } - - compile(toSelfie = false) { - return [ - this.fid, - this.filters.map(filter => filter.compile(toSelfie)), - this.plainTrie !== null && - FilterBucket.trieContainer.compileOne(this.plainTrie), - this.plainHnAnchoredTrie !== null && - FilterBucket.trieContainer.compileOne(this.plainHnAnchoredTrie), - ]; - } - - _countTrieable() { - for ( const f of this.filters ) { - if ( f.trieableId === 0 ) { - this.plainCount += 1; - } else if ( f.trieableId === 1 ) { - this.plainHnAnchoredCount += 1; - } - } - } - - _transferTrieable(trieableId, trie) { - const filters = this.filters; - let i = filters.length; - while ( i-- ) { - const f = filters[i]; - if ( f.trieableId !== trieableId ) { continue; } - if ( f.addToTrie(trie) === false ) { continue; } - filters.splice(i, 1); - } - } - - // Promote hit filters so they can be found faster next time. 
- _promote(i) { - const filters = this.filters; - let pivot = filters.length >>> 1; - while ( i < pivot ) { - pivot >>>= 1; - if ( pivot < 16 ) { break; } - } - if ( i <= pivot ) { return; } - const j = this.promoted % pivot; - //console.debug('FilterBucket.promote(): promoted %d to %d', i, j); - const f = filters[j]; - filters[j] = filters[i]; - filters[i] = f; - this.promoted += 1; - } - - static reset() { - FilterBucket.trieContainer.reset(); - } - - static optimize() { - const trieDetails = FilterBucket.trieContainer.optimize(); - vAPI.localStorage.setItem( - 'FilterBucket.trieDetails', - JSON.stringify(trieDetails) + _addToTrie(iunit) { + const f = filterUnits[iunit]; + const trieDetails = f.toBidiTrie(); + const id = this.plainTrie.add( + trieDetails.i, + trieDetails.n, + trieDetails.itok ); + // No point storing a pattern with conditions if the bidi-trie already + // contains a pattern with no conditions. + let ix = this.plainTrie.getExtra(id); + if ( ix === 1 ) { + filterUnits[iunit] = null; + return; + } + // If the newly stored pattern has no condition, shortcut existing + // ones since they will always be short-circuited by the + // condition-less pattern. + if ( f instanceof FilterPatternPlain ) { + this.plainTrie.setExtra(id, 1); + filterUnits[iunit] = null; + return; + } + // FilterComposite is assumed here, i.e. with conditions.
+ if ( f.n === 1 ) { + filterUnits[iunit] = null; + iunit = filterSequences[f.i]; + } + this.plainTrie.setExtra(id, filterSequenceAdd(iunit, ix)); } - static load(args) { - const bucket = new FilterBucket(); - bucket.filters = args[1].map(data => filterFromCompiledData(data)); - if ( Array.isArray(args[2]) ) { - bucket.plainTrie = - FilterBucket.trieContainer.createOne(args[2]); - } - if ( Array.isArray(args[3]) ) { - bucket.plainHnAnchoredTrie = - FilterBucket.trieContainer.createOne(args[3]); + static fromSelfie(args) { + const bucket = FilterCollection.fromSelfie(FilterBucket, args); + if ( args.length > 3 && Array.isArray(args[3]) ) { + bucket.plainTrie = bidiTrie.createOne(args[3]); } return bucket; } }; -FilterBucket.prototype.f = null; -FilterBucket.prototype.promoted = 0; - -FilterBucket.prototype.plainCount = 0; FilterBucket.prototype.plainTrie = null; -FilterBucket.prototype.plainFilter = new FilterPlainX('', 0); - -FilterBucket.prototype.plainHnAnchoredCount = 0; -FilterBucket.prototype.plainHnAnchoredTrie = null; -FilterBucket.prototype.plainHnAnchoredFilter = new FilterPlainHnAnchoredX('', 0); - -FilterBucket.trieContainer = (( ) => { - let trieDetails; - try { - trieDetails = JSON.parse( - vAPI.localStorage.getItem('FilterBucket.trieDetails') - ); - } catch(ex) { - } - return new µBlock.BidiTrieContainer(trieDetails); -})(); +FilterBucket.prototype.$matchedUnit = 0; +FilterBucket.prototype.$matchedTrie = false; registerFilterClass(FilterBucket); -/******************************************************************************/ /******************************************************************************/ -const FilterParser = function() { - this.cantWebsocket = vAPI.cantWebsocket; - this.reBadDomainOptChars = /[*+?^${}()[\]\\]/; - this.reHostnameRule1 = /^\w[\w.-]*[a-z]$/i; - this.reHostnameRule2 = /^\w[\w.-]*[a-z]\^?$/i; - this.reCanTrimCarets1 = /^[^*]*$/; - this.reCanTrimCarets2 = /^\^?[^^]+[^^][^^]+\^?$/; - this.reIsolateHostname = 
/^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/; - this.reHasUnicode = /[^\x00-\x7F]/; - this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/; - this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/; - this.domainOpt = ''; - this.noTokenHash = µb.urlTokenizer.noTokenHash; - this.reset(); -}; +const FILTER_UNITS_MIN = filterUnits.length; +const FILTER_SEQUENCES_MIN = filterSequenceWritePtr; /******************************************************************************/ - -FilterParser.prototype.reset = function() { - this.action = BlockAction; - this.anchor = 0; - this.badFilter = false; - this.dataType = undefined; - this.data = undefined; - this.elemHiding = false; - this.f = ''; - this.firstParty = false; - this.thirdParty = false; - this.party = AnyParty; - this.fopts = ''; - this.domainOpt = ''; - this.isPureHostname = false; - this.isRegex = false; - this.raw = ''; - this.redirect = 0; - this.token = '*'; - this.tokenHash = this.noTokenHash; - this.tokenBeg = 0; - this.types = 0; - this.notTypes = 0; - this.important = 0; - this.wildcarded = false; - this.unsupported = false; - return this; -}; - /******************************************************************************/ -FilterParser.prototype.bitFromType = function(type) { - return 1 << ((typeNameToTypeValue[type] >>> 4) - 1); -}; - -/******************************************************************************/ - -// https://github.com/chrisaljoudi/uBlock/issues/589 -// Be ready to handle multiple negated types - -FilterParser.prototype.parseTypeOption = function(raw, not) { - const typeBit = raw !== 'all' - ? 
this.bitFromType(toNormalizedType[raw]) - : allTypesBits; - - if ( not ) { - this.notTypes |= typeBit; - } else { - this.types |= typeBit; +const FilterParser = class { + constructor() { + this.cantWebsocket = vAPI.cantWebsocket; + this.domainOpt = ''; + this.noTokenHash = urlTokenizer.noTokenHash; + this.reBadDomainOptChars = /[*+?^${}()[\]\\]/; + this.reHostnameRule1 = /^\w[\w.-]*[a-z]$/i; + this.reHostnameRule2 = /^\w[\w.-]*[a-z]\^?$/i; + this.reCanTrimCarets1 = /^[^*]*$/; + this.reCanTrimCarets2 = /^\^?[^^]+[^^][^^]+\^?$/; + this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/; + this.reHasUnicode = /[^\x00-\x7F]/; + this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/; + this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/; + this.reGoodToken = /[%0-9a-z]{1,}/g; + this.reSeparator = /[\/^]/; + this.reRegexToken = /[%0-9A-Za-z]{2,}/g; + this.reRegexTokenAbort = /[([]/; + this.reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/; + this.reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/; + // These top 100 "bad tokens" are collated using the "miss" histogram + // from tokenHistograms(). The "score" is their occurrence among the + // 200K+ URLs used in the benchmark and executed against default + // filter lists. 
+ this.badTokens = new Map([ + [ 'https',123617 ], + [ 'com',76987 ], + [ 'js',43620 ], + [ 'www',33129 ], + [ 'jpg',32221 ], + [ 'images',31812 ], + [ 'css',19715 ], + [ 'png',19140 ], + [ 'static',15724 ], + [ 'net',15239 ], + [ 'de',13155 ], + [ 'img',11109 ], + [ 'assets',10746 ], + [ 'min',7807 ], + [ 'cdn',7568 ], + [ 'content',6900 ], + [ 'wp',6444 ], + [ 'fonts',6095 ], + [ 'svg',5976 ], + [ 'http',5813 ], + [ 'ssl',5735 ], + [ 'amazon',5440 ], + [ 'ru',5427 ], + [ 'fr',5199 ], + [ 'facebook',5178 ], + [ 'en',5146 ], + [ 'image',5028 ], + [ 'html',4837 ], + [ 'media',4833 ], + [ 'co',4783 ], + [ 'php',3972 ], + [ '2019',3943 ], + [ 'org',3924 ], + [ 'jquery',3531 ], + [ '02',3438 ], + [ 'api',3382 ], + [ 'gif',3350 ], + [ 'eu',3322 ], + [ 'prod',3289 ], + [ 'woff2',3200 ], + [ 'logo',3194 ], + [ 'themes',3107 ], + [ 'icon',3048 ], + [ 'google',3026 ], + [ 'v1',3019 ], + [ 'uploads',2963 ], + [ 'googleapis',2860 ], + [ 'v3',2816 ], + [ 'tv',2762 ], + [ 'icons',2748 ], + [ 'core',2601 ], + [ 'gstatic',2581 ], + [ 'ac',2509 ], + [ 'utag',2466 ], + [ 'id',2459 ], + [ 'ver',2448 ], + [ 'rsrc',2387 ], + [ 'files',2361 ], + [ 'uk',2357 ], + [ 'us',2271 ], + [ 'pl',2262 ], + [ 'common',2205 ], + [ 'public',2076 ], + [ '01',2016 ], + [ 'na',1957 ], + [ 'v2',1954 ], + [ '12',1914 ], + [ 'thumb',1895 ], + [ 'web',1853 ], + [ 'ui',1841 ], + [ 'default',1825 ], + [ 'main',1737 ], + [ 'false',1715 ], + [ '2018',1697 ], + [ 'embed',1639 ], + [ 'player',1634 ], + [ 'dist',1599 ], + [ 'woff',1593 ], + [ 'global',1593 ], + [ 'json',1572 ], + [ '11',1566 ], + [ '600',1559 ], + [ 'app',1556 ], + [ 'styles',1533 ], + [ 'plugins',1526 ], + [ '274',1512 ], + [ 'random',1505 ], + [ 'sites',1505 ], + [ 'imasdk',1501 ], + [ 'bridge3',1501 ], + [ 'news',1496 ], + [ 'width',1494 ], + [ 'thumbs',1485 ], + [ 'ttf',1470 ], + [ 'ajax',1463 ], + [ 'user',1454 ], + [ 'scripts',1446 ], + [ 'twitter',1440 ], + [ 'crop',1431 ], + [ 'new',1412] + ]); + this.maxTokenLen = 
urlTokenizer.MAX_TOKEN_LENGTH; + this.reset(); } -}; -/******************************************************************************/ - -FilterParser.prototype.parsePartyOption = function(firstParty, not) { - if ( firstParty ) { - not = !not; + reset() { + this.action = BlockAction; + // anchor: bit vector + // 0000 (0x0): no anchoring + // 0001 (0x1): anchored to the end of the URL. + // 0010 (0x2): anchored to the start of the URL. + // 0011 (0x3): anchored to the start and end of the URL. + // 0100 (0x4): anchored to the hostname of the URL. + // 0101 (0x5): anchored to the hostname and end of the URL. + this.anchor = 0; + this.badFilter = false; + this.dataType = undefined; + this.data = undefined; + this.invalid = false; + this.f = ''; + this.firstParty = false; + this.thirdParty = false; + this.party = AnyParty; + this.fopts = ''; + this.domainOpt = ''; + this.isPureHostname = false; + this.isRegex = false; + this.raw = ''; + this.redirect = 0; + this.token = '*'; + this.tokenHash = this.noTokenHash; + this.tokenBeg = 0; + this.typeBits = 0; + this.notTypes = 0; + this.important = 0; + this.firstWildcardPos = -1; + this.secondWildcardPos = -1; + this.firstCaretPos = -1; + this.secondCaretPos = -1; + this.unsupported = false; + return this; } - if ( not ) { - this.firstParty = true; - this.party = this.thirdParty ? AnyParty : FirstParty; - } else { - this.thirdParty = true; - this.party = this.firstParty ? 
AnyParty : ThirdParty; - } -}; -/******************************************************************************/ - -FilterParser.prototype.parseDomainOption = function(s) { - if ( this.reHasUnicode.test(s) ) { - const hostnames = s.split('|'); - let i = hostnames.length; - while ( i-- ) { - if ( this.reHasUnicode.test(hostnames[i]) ) { - hostnames[i] = punycode.toASCII(hostnames[i]); - } + normalizeRegexSource(s) { + try { + const re = new RegExp(s); + return re.source; + } catch (ex) { } - s = hostnames.join('|'); + return ''; } - if ( this.reBadDomainOptChars.test(s) ) { return ''; } - return s; -}; -/******************************************************************************/ + bitFromType(type) { + return 1 << ((typeNameToTypeValue[type] >>> 4) - 1); + } -FilterParser.prototype.parseOptions = function(s) { - this.fopts = s; - for ( let opt of s.split(/\s*,\s*/) ) { - const not = opt.startsWith('~'); + // https://github.com/chrisaljoudi/uBlock/issues/589 + // Be ready to handle multiple negated types + + parseTypeOption(raw, not) { + const typeBit = raw !== 'all' + ? this.bitFromType(toNormalizedType[raw]) + : allTypesBits; if ( not ) { - opt = opt.slice(1); + this.notTypes |= typeBit; + } else { + this.typeBits |= typeBit; } - if ( opt === 'third-party' || opt === '3p' ) { - this.parsePartyOption(false, not); - continue; - } - if ( opt === 'first-party' || opt === '1p' ) { - this.parsePartyOption(true, not); - continue; - } - if ( toNormalizedType.hasOwnProperty(opt) ) { - this.parseTypeOption(opt, not); - continue; - } - // https://github.com/gorhill/uBlock/issues/2294 - // Detect and discard filter if domain option contains nonsensical - // characters. 
- if ( opt.startsWith('domain=') ) { - this.domainOpt = this.parseDomainOption(opt.slice(7)); - if ( this.domainOpt === '' ) { - this.unsupported = true; - break; - } - continue; - } - if ( opt === 'important' ) { - this.important = Important; - continue; - } - if ( /^redirect(?:-rule)?=/.test(opt) ) { - if ( this.redirect !== 0 ) { - this.unsupported = true; - break; - } - this.redirect = opt.charCodeAt(8) === 0x3D /* '=' */ ? 1 : 2; - continue; - } - if ( - opt.startsWith('csp=') && - opt.length > 4 && - this.reBadCSP.test(opt) === false - ) { - this.parseTypeOption('data', not); - this.dataType = 'csp'; - this.data = opt.slice(4).trim(); - continue; - } - if ( opt === 'csp' && this.action === AllowAction ) { - this.parseTypeOption('data', not); - this.dataType = 'csp'; - this.data = ''; - continue; - } - // Used by Adguard: - // https://kb.adguard.com/en/general/how-to-create-your-own-ad-filters?aid=16593#empty-modifier - if ( opt === 'empty' || opt === 'mp4' ) { - if ( this.redirect !== 0 ) { - this.unsupported = true; - break; - } - this.redirect = 1; - continue; - } - // https://github.com/uBlockOrigin/uAssets/issues/192 - if ( opt === 'badfilter' ) { - this.badFilter = true; - continue; - } - // https://www.reddit.com/r/uBlockOrigin/comments/d6vxzj/ - // Add support for `elemhide`. Rarely used but it happens. - if ( opt === 'elemhide' || opt === 'ehide' ) { - this.parseTypeOption('specifichide', not); - this.parseTypeOption('generichide', not); - continue; - } - // Unrecognized filter option: ignore whole filter. - this.unsupported = true; - break; } - // Redirect rules can't be exception filters. - if ( this.redirect !== 0 && this.action !== BlockAction ) { - this.unsupported = true; + parsePartyOption(firstParty, not) { + if ( firstParty ) { + not = !not; + } + if ( not ) { + this.firstParty = true; + this.party = this.thirdParty ? AnyParty : FirstParty; + } else { + this.thirdParty = true; + this.party = this.firstParty ? 
AnyParty : ThirdParty; + } } - // Negated network types? Toggle on all network type bits. - // Negated non-network types can only toggle themselves. - if ( (this.notTypes & allNetworkTypesBits) !== 0 ) { - this.types |= allNetworkTypesBits; + parseDomainOption(s) { + if ( this.reHasUnicode.test(s) ) { + const hostnames = s.split('|'); + let i = hostnames.length; + while ( i-- ) { + if ( this.reHasUnicode.test(hostnames[i]) ) { + hostnames[i] = punycode.toASCII(hostnames[i]); + } + } + s = hostnames.join('|'); + } + if ( this.reBadDomainOptChars.test(s) ) { return ''; } + return s; } - if ( this.notTypes !== 0 ) { - this.types &= ~this.notTypes; - if ( this.types === 0 ) { + + parseOptions(s) { + this.fopts = s; + for ( let opt of s.split(/\s*,\s*/) ) { + const not = opt.startsWith('~'); + if ( not ) { + opt = opt.slice(1); + } + if ( opt === 'third-party' || opt === '3p' ) { + this.parsePartyOption(false, not); + continue; + } + if ( opt === 'first-party' || opt === '1p' ) { + this.parsePartyOption(true, not); + continue; + } + if ( toNormalizedType.hasOwnProperty(opt) ) { + this.parseTypeOption(opt, not); + continue; + } + // https://github.com/gorhill/uBlock/issues/2294 + // Detect and discard filter if domain option contains nonsensical + // characters. + if ( opt.startsWith('domain=') ) { + this.domainOpt = this.parseDomainOption(opt.slice(7)); + if ( this.domainOpt === '' ) { + this.unsupported = true; + break; + } + continue; + } + if ( opt === 'important' ) { + this.important = Important; + continue; + } + if ( /^redirect(?:-rule)?=/.test(opt) ) { + if ( this.redirect !== 0 ) { + this.unsupported = true; + break; + } + this.redirect = opt.charCodeAt(8) === 0x3D /* '=' */ ? 
1 : 2; + continue; + } + if ( + opt.startsWith('csp=') && + opt.length > 4 && + this.reBadCSP.test(opt) === false + ) { + this.parseTypeOption('data', not); + this.dataType = 'csp'; + this.data = opt.slice(4).trim(); + continue; + } + if ( opt === 'csp' && this.action === AllowAction ) { + this.parseTypeOption('data', not); + this.dataType = 'csp'; + this.data = ''; + continue; + } + // Used by Adguard: + // https://kb.adguard.com/en/general/how-to-create-your-own-ad-filters?aid=16593#empty-modifier + if ( opt === 'empty' || opt === 'mp4' ) { + if ( this.redirect !== 0 ) { + this.unsupported = true; + break; + } + this.redirect = 1; + continue; + } + // https://github.com/uBlockOrigin/uAssets/issues/192 + if ( opt === 'badfilter' ) { + this.badFilter = true; + continue; + } + // https://www.reddit.com/r/uBlockOrigin/comments/d6vxzj/ + // Add support for `elemhide`. Rarely used but it happens. + if ( opt === 'elemhide' || opt === 'ehide' ) { + this.parseTypeOption('specifichide', not); + this.parseTypeOption('generichide', not); + continue; + } + // Unrecognized filter option: ignore whole filter. + this.unsupported = true; + break; + } + + // Redirect rules can't be exception filters. + if ( this.redirect !== 0 && this.action !== BlockAction ) { this.unsupported = true; } - } - // https://github.com/gorhill/uBlock/issues/2283 - // Abort if type is only for unsupported types, otherwise - // toggle off `unsupported` bit. - if ( this.types & unsupportedTypeBit ) { - this.types &= ~unsupportedTypeBit; - if ( this.types === 0 ) { - this.unsupported = true; + // Negated network types? Toggle on all network type bits. + // Negated non-network types can only toggle themselves. 
+ if ( (this.notTypes & allNetworkTypesBits) !== 0 ) { + this.typeBits |= allNetworkTypesBits; + } + if ( this.notTypes !== 0 ) { + this.typeBits &= ~this.notTypes; + if ( this.typeBits === 0 ) { + this.unsupported = true; + } + } + + // https://github.com/gorhill/uBlock/issues/2283 + // Abort if type is only for unsupported types, otherwise + // toggle off `unsupported` bit. + if ( this.typeBits & unsupportedTypeBit ) { + this.typeBits &= ~unsupportedTypeBit; + if ( this.typeBits === 0 ) { + this.unsupported = true; + } } } -}; -/******************************************************************************* + // TODO: use charCodeAt where possible. - anchor: bit vector - 0000 (0x0): no anchoring - 0001 (0x1): anchored to the end of the URL. - 0010 (0x2): anchored to the start of the URL. - 0011 (0x3): anchored to the start and end of the URL. - 0100 (0x4): anchored to the hostname of the URL. - 0101 (0x5): anchored to the hostname and end of the URL. + parse(raw) { + // important! + this.reset(); -**/ + let s = this.raw = raw.trim(); -FilterParser.prototype.parse = function(raw) { - // important! - this.reset(); - - let s = this.raw = raw; - - // Filters which are a single alphanumeric character are discarded - // as unsupported. - if ( s.length === 1 && /[0-9a-z]/i.test(s) ) { - this.unsupported = true; - return this; - } - - // plain hostname? (from HOSTS file) - if ( this.reHostnameRule1.test(s) ) { - this.f = s.toLowerCase(); - this.isPureHostname = true; - this.anchor |= 0x4; - return this; - } - - // element hiding filter? - let pos = s.indexOf('#'); - if ( pos !== -1 ) { - const c = s.charAt(pos + 1); - if ( c === '#' || c === '@' ) { - console.error('static-net-filtering.js > unexpected cosmetic filters'); - this.elemHiding = true; + if ( s.length === 0 ) { + this.invalid = true; return this; } - } - // block or allow filter? 
- // Important: this must be executed before parsing options - if ( s.startsWith('@@') ) { - this.action = AllowAction; - s = s.slice(2); - } + // Filters which are a single alphanumeric character are discarded + // as unsupported. + if ( s.length === 1 && /[0-9a-z]/i.test(s) ) { + this.unsupported = true; + return this; + } - // options - // https://github.com/gorhill/uBlock/issues/842 - // - ensure sure we are not dealing with a regex-based filter. - // - lookup the last occurrence of `$`. - if ( s.startsWith('/') === false || s.endsWith('/') === false ) { - pos = s.lastIndexOf('$'); + // plain hostname? (from HOSTS file) + if ( this.reHostnameRule1.test(s) ) { + this.f = s.toLowerCase(); + this.isPureHostname = true; + this.anchor |= 0b100; + return this; + } + + // element hiding filter? + let pos = s.indexOf('#'); if ( pos !== -1 ) { - // https://github.com/gorhill/uBlock/issues/952 - // Discard Adguard-specific `$$` filters. - if ( s.indexOf('$$') !== -1 ) { + const c = s.charAt(pos + 1); + if ( c === '#' || c === '@' ) { + console.error('static-net-filtering.js > unexpected cosmetic filters'); + this.invalid = true; + return this; + } + } + + // block or allow filter? + // Important: this must be executed before parsing options + if ( s.startsWith('@@') ) { + this.action = AllowAction; + s = s.slice(2); + } + + // options + // https://github.com/gorhill/uBlock/issues/842 + // - make sure we are not dealing with a regex-based filter. + // - lookup the last occurrence of `$`. + if ( + s.charCodeAt(0) !== 0x2F /* '/' */ || + s.charCodeAt(s.length - 1) !== 0x2F /* '/' */ + ) { + pos = s.lastIndexOf('$'); + if ( pos !== -1 ) { + // https://github.com/gorhill/uBlock/issues/952 + // Discard Adguard-specific `$$` filters. + if ( s.indexOf('$$') !== -1 ) { + this.unsupported = true; + return this; + } + this.parseOptions(s.slice(pos + 1)); + if ( this.unsupported ) { return this; } + s = s.slice(0, pos); + } + } + + // regex?
+ if ( + s.length > 2 && + s.charCodeAt(0) === 0x2F /* '/' */ && + s.charCodeAt(s.length - 1) === 0x2F /* '/' */ + ) { + this.isRegex = true; + this.f = s.slice(1, -1); + // https://github.com/gorhill/uBlock/issues/1246 + // If the filter is valid, use the corrected version of the + // source string -- this ensures reverse-lookup will work fine. + this.f = this.normalizeRegexSource(this.f); + if ( this.f === '' ) { + this.unsupported = true; + } + return this; + } + + // hostname-anchored + if ( s.startsWith('||') ) { + this.anchor |= 0x4; + s = s.slice(2); + + // convert hostname to punycode if needed + // https://github.com/gorhill/uBlock/issues/2599 + if ( this.reHasUnicode.test(s) ) { + const matches = this.reIsolateHostname.exec(s); + if ( matches ) { + s = (matches[1] !== undefined ? matches[1] : '') + + punycode.toASCII(matches[2]) + + matches[3]; + } + } + + // https://github.com/chrisaljoudi/uBlock/issues/1096 + if ( s.startsWith('^') ) { this.unsupported = true; return this; } - this.parseOptions(s.slice(pos + 1)); - if ( this.unsupported ) { return this; } - s = s.slice(0, pos); - } - } - // regex? - if ( s.startsWith('/') && s.endsWith('/') && s.length > 2 ) { - this.isRegex = true; - this.f = s.slice(1, -1); - // https://github.com/gorhill/uBlock/issues/1246 - // If the filter is valid, use the corrected version of the source - // string -- this ensure reverse-lookup will work fine. - this.f = normalizeRegexSource(this.f); - if ( this.f === '' ) { - console.error( - "uBlock Origin> discarding bad regular expression-based network filter '%s': '%s'", - raw, - normalizeRegexSource.message - ); - this.unsupported = true; + // plain hostname? (from ABP filter list) + // https://github.com/gorhill/uBlock/issues/1757 + // A filter can't be a pure-hostname one if there is a domain or + // csp option present.
+ if ( this.reHostnameRule2.test(s) ) { + if ( s.charCodeAt(s.length - 1) === 0x5E /* '^' */ ) { + s = s.slice(0, -1); + } + this.f = s.toLowerCase(); + this.isPureHostname = true; + return this; + } } + + // left-anchored + else if ( s.startsWith('|') ) { + this.anchor |= 0x2; + s = s.slice(1); + } + + // right-anchored + if ( s.endsWith('|') ) { + this.anchor |= 0x1; + s = s.slice(0, -1); + } + + // https://github.com/gorhill/uBlock/issues/1669#issuecomment-224822448 + // Remove pointless leading *. + // https://github.com/gorhill/uBlock/issues/3034 + // We can remove anchoring if we need to match all at the start. + if ( s.startsWith('*') ) { + s = s.replace(/^\*+([^%0-9a-z])/i, '$1'); + this.anchor &= ~0x6; + } + // Remove pointless trailing * + // https://github.com/gorhill/uBlock/issues/3034 + // We can remove anchoring if we need to match all at the end. + if ( s.endsWith('*') ) { + s = s.replace(/([^%0-9a-z])\*+$/i, '$1'); + this.anchor &= ~0x1; + } + + // nothing left? + if ( s === '' ) { + s = '*'; + } + // TODO: remove once redirect rules with `*/*` pattern are no longer + // used. + else if ( this.redirect !== 0 && s === '/' ) { + s = '*'; + } + + // https://github.com/gorhill/uBlock/issues/1047 + // Hostname-anchored makes no sense if matching all requests. + if ( s === '*' ) { + this.anchor = 0; + } + + this.firstWildcardPos = s.indexOf('*'); + if ( this.firstWildcardPos !== -1 ) { + this.secondWildcardPos = s.indexOf('*', this.firstWildcardPos + 1); + } + this.firstCaretPos = s.indexOf('^'); + if ( this.firstCaretPos !== -1 ) { + this.secondCaretPos = s.indexOf('^', this.firstCaretPos + 1); + } + + if ( s.length > 1024 ) { + this.unsupported = true; + return this; + } + + this.f = s.toLowerCase(); + return this; } - // hostname-anchored - if ( s.startsWith('||') ) { - this.anchor |= 0x4; - s = s.slice(2); + // Given a string, find a good token. Tokens which are too generic, + // i.e. 
very common with a high probability of ending up as a miss, + // are not good. Avoid if possible. This has a significant positive + // impact on performance. - // convert hostname to punycode if needed - // https://github.com/gorhill/uBlock/issues/2599 - if ( this.reHasUnicode.test(s) ) { - const matches = this.reIsolateHostname.exec(s); - if ( matches ) { - s = (matches[1] !== undefined ? matches[1] : '') + - punycode.toASCII(matches[2]) + - matches[3]; - //console.debug('µBlock.staticNetFilteringEngine/FilterParser.parse():', raw, '=', s); - } + makeToken() { + if ( this.isRegex ) { + this.extractTokenFromRegex(); + return; } - - // https://github.com/chrisaljoudi/uBlock/issues/1096 - if ( s.startsWith('^') ) { - this.unsupported = true; - return this; - } - - // plain hostname? (from ABP filter list) - // https://github.com/gorhill/uBlock/issues/1757 - // A filter can't be a pure-hostname one if there is a domain or csp - // option present. - if ( this.reHostnameRule2.test(s) ) { - if ( s.charCodeAt(s.length - 1) === 0x5E /* '^' */ ) { - s = s.slice(0, -1); - } - this.f = s.toLowerCase(); - this.isPureHostname = true; - return this; - } - } - // left-anchored - else if ( s.startsWith('|') ) { - this.anchor |= 0x2; - s = s.slice(1); - } - - // right-anchored - if ( s.endsWith('|') ) { - this.anchor |= 0x1; - s = s.slice(0, -1); - } - - // https://github.com/gorhill/uBlock/issues/1669#issuecomment-224822448 - // remove pointless leading *. - // https://github.com/gorhill/uBlock/issues/3034 - // - We can remove anchoring if we need to match all at the start. - if ( s.startsWith('*') ) { - s = s.replace(/^\*+([^%0-9a-z])/i, '$1'); - this.anchor &= ~0x6; - } - // remove pointless trailing * - // https://github.com/gorhill/uBlock/issues/3034 - // - We can remove anchoring if we need to match all at the end. - if ( s.endsWith('*') ) { - s = s.replace(/([^%0-9a-z])\*+$/i, '$1'); - this.anchor &= ~0x1; - } - - // nothing left? 
- if ( s === '' ) { - s = '*'; - } - // TODO: remove once redirect rules with `*/*` pattern are no longer used. - else if ( this.redirect !== 0 && s === '/' ) { - s = '*'; - } - - // https://github.com/gorhill/uBlock/issues/1047 - // Hostname-anchored makes no sense if matching all requests. - if ( s === '*' ) { - this.anchor = 0; - } - - this.wildcarded = reIsWildcarded.test(s); - this.f = s.toLowerCase(); - - return this; -}; - -/******************************************************************************/ - -// Given a string, find a good token. Tokens which are too generic, i.e. very -// common with a high probability of ending up as a miss, are not -// good. Avoid if possible. This has a *significant* positive impact on -// performance. -// These "bad tokens" are collated manually. - -// Hostname-anchored with no wildcard always have a token index of 0. -const reGoodToken = /[%0-9a-z]{2,}/g; -const reRegexToken = /[%0-9A-Za-z]{2,}/g; -const reRegexTokenAbort = /[([]/; -const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/; -const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/; - -const badTokens = new Set([ - 'com', - 'google', - 'http', - 'https', - 'icon', - 'images', - 'img', - 'js', - 'net', - 'news', - 'www' -]); - -FilterParser.prototype.findFirstGoodToken = function() { - reGoodToken.lastIndex = 0; - const s = this.f; - let matches; - let badTokenMatch = null; - while ( (matches = reGoodToken.exec(s)) !== null ) { - // https://github.com/gorhill/uBlock/issues/997 - // Ignore token if preceded by wildcard. 
- const lpos = matches.index; - if ( lpos !== 0 && s.charCodeAt(lpos - 1) === 0x2A /* '*' */ ) { - continue; - } - if ( s.charCodeAt(reGoodToken.lastIndex) === 0x2A /* '*' */ ) { - continue; - } - if ( badTokens.has(matches[0]) ) { - if ( badTokenMatch === null ) { - badTokenMatch = matches; - } - continue; - } - return matches; - } - return badTokenMatch; -}; - -FilterParser.prototype.extractTokenFromRegex = function() { - reRegexToken.lastIndex = 0; - const s = this.f; - let matches; - while ( (matches = reRegexToken.exec(s)) !== null ) { - const prefix = s.slice(0, matches.index); - if ( reRegexTokenAbort.test(prefix) ) { return; } - if ( - reRegexBadPrefix.test(prefix) || - reRegexBadSuffix.test(s.slice(reRegexToken.lastIndex)) - ) { - continue; - } - this.token = matches[0].toLowerCase(); - this.tokenHash = µb.urlTokenizer.tokenHashFromString(this.token); - this.tokenBeg = matches.index; - if ( badTokens.has(this.token) === false ) { break; } - } -}; - -/******************************************************************************/ - -// https://github.com/chrisaljoudi/uBlock/issues/1038 -// Single asterisk will match any URL. - -// https://github.com/gorhill/uBlock/issues/2781 -// For efficiency purpose, try to extract a token from a regex-based filter. 
- -FilterParser.prototype.makeToken = function() { - if ( this.isRegex ) { - this.extractTokenFromRegex(); - return; - } - - if ( this.f === '*' ) { return; } - - let matches = this.findFirstGoodToken(); - if ( matches !== null ) { + if ( this.f === '*' ) { return; } + const matches = this.findGoodToken(); + if ( matches === null ) { return; } this.token = matches[0]; - this.tokenHash = µb.urlTokenizer.tokenHashFromString(this.token); + this.tokenHash = urlTokenizer.tokenHashFromString(this.token); this.tokenBeg = matches.index; } + + findGoodToken() { + this.reGoodToken.lastIndex = 0; + const s = this.f; + let bestMatch = null; + let bestBadness = 0; + let match; + while ( (match = this.reGoodToken.exec(s)) !== null ) { + const token = match[0]; + // https://github.com/gorhill/uBlock/issues/997 + // Ignore token if preceded by wildcard. + const pos = match.index; + if ( + pos !== 0 && + s.charCodeAt(pos - 1) === 0x2A /* '*' */ || + token.length < this.maxTokenLen && + s.charCodeAt(pos + token.length) === 0x2A /* '*' */ + ) { + continue; + } + // A one-char token is better than a documented bad token. + const badness = token.length > 1 + ? this.badTokens.get(token) || 0 + : 1; + if ( badness === 0 ) { return match; } + if ( bestBadness === 0 || badness < bestBadness ) { + bestMatch = match; + bestBadness = badness; + } + } + return bestMatch; + } + + // https://github.com/gorhill/uBlock/issues/2781 + // For efficiency purpose, try to extract a token from + // a regex-based filter. 
+ extractTokenFromRegex() { + this.reRegexToken.lastIndex = 0; + const s = this.f; + let matches; + while ( (matches = this.reRegexToken.exec(s)) !== null ) { + const prefix = s.slice(0, matches.index); + if ( this.reRegexTokenAbort.test(prefix) ) { return; } + if ( + this.reRegexBadPrefix.test(prefix) || + this.reRegexBadSuffix.test(s.slice(this.reRegexToken.lastIndex)) + ) { + continue; + } + this.token = matches[0].toLowerCase(); + this.tokenHash = urlTokenizer.tokenHashFromString(this.token); + this.tokenBeg = matches.index; + if ( this.badTokens.has(this.token) === false ) { break; } + } + } + + isJustOrigin() { + return this.isRegex === false && + this.dataType === undefined && + this.domainOpt !== '' && ( + this.f === '*' || ( + this.anchor === 0b010 && + /^(?:http[s*]?:(?:\/\/)?)$/.test(this.f) + ) + ) && + this.domainOpt.indexOf('~') === -1; + } }; /******************************************************************************/ -FilterParser.prototype.isJustOrigin = function() { - return this.dataType === undefined && - this.domainOpt !== '' && - /^(?:\*|http[s*]?:(?:\/\/)?)$/.test(this.f) && - this.domainOpt.indexOf('~') === -1; -}; +FilterParser.parse = (( ) => { + let parser; + let last = 0; + let ttlTimer; + + const ttlProcess = ( ) => { + ttlTimer = undefined; + if ( (Date.now() - last) > 10000 ) { + parser = undefined; + return; + } + ttlTimer = vAPI.setTimeout(ttlProcess, 10007); + }; + + return s => { + if ( parser === undefined ) { + parser = new FilterParser(); + } + last = Date.now(); + if ( ttlTimer === undefined ) { + ttlTimer = vAPI.setTimeout(ttlProcess, 10007); + } + return parser.parse(s); + }; +})(); /******************************************************************************/ /******************************************************************************/ const FilterContainer = function() { - this.filterParser = new FilterParser(); - this.urlTokenizer = µb.urlTokenizer; - this.noTokenHash = this.urlTokenizer.noTokenHash; - 
this.dotTokenHash = this.urlTokenizer.dotTokenHash; - this.anyTokenHash = this.urlTokenizer.anyTokenHash; - this.anyHTTPSTokenHash = this.urlTokenizer.anyHTTPSTokenHash; - this.anyHTTPTokenHash = this.urlTokenizer.anyHTTPTokenHash; + this.noTokenHash = urlTokenizer.noTokenHash; + this.dotTokenHash = urlTokenizer.dotTokenHash; + this.anyTokenHash = urlTokenizer.anyTokenHash; + this.anyHTTPSTokenHash = urlTokenizer.anyHTTPSTokenHash; + this.anyHTTPTokenHash = urlTokenizer.anyHTTPTokenHash; this.reset(); }; /******************************************************************************/ -// Reset all, thus reducing to a minimum memory footprint of the context. - FilterContainer.prototype.reset = function() { this.frozen = false; this.processedFilterCount = 0; @@ -2408,28 +2596,30 @@ FilterContainer.prototype.reset = function() { this.goodFilters = new Set(); this.badFilters = new Set(); this.categories = new Map(); - this.dataFilters = new Map(); - this.filterParser.reset(); - this.urlTokenizer.resetKnownTokens(); + + urlTokenizer.resetKnownTokens(); // This will invalidate all tries FilterHostnameDict.reset(); filterOrigin.reset(); - FilterBucket.reset(); + bidiTrie.reset(); + + filterUnits = filterUnits.slice(0, FILTER_UNITS_MIN); + filterSequenceWritePtr = FILTER_SEQUENCES_MIN; // Runtime registers this.$catbits = 0; this.$tokenHash = 0; - this.$filter = null; + this.$filterUnit = 0; }; /******************************************************************************/ FilterContainer.prototype.freeze = function() { - const filterPairId = FilterPair.fid; const filterBucketId = FilterBucket.fid; const redirectTypeValue = typeNameToTypeValue.redirect; const unserialize = µb.CompiledLineIO.unserialize; + const units = filterUnits; for ( const line of this.goodFilters ) { if ( this.badFilters.has(line) ) { @@ -2456,95 +2646,87 @@ FilterContainer.prototype.freeze = function() { bucket = new Map(); this.categories.set(bits, bucket); } - let entry = bucket.get(tokenHash); + 
let iunit = bucket.get(tokenHash); if ( tokenHash === this.dotTokenHash ) { - if ( entry === undefined ) { - entry = new FilterHostnameDict(); - bucket.set(this.dotTokenHash, entry); + if ( iunit === undefined ) { + iunit = filterFromCtor(FilterHostnameDict); + bucket.set(this.dotTokenHash, iunit); } - entry.add(fdata); + units[iunit].add(fdata); continue; } if ( tokenHash === this.anyTokenHash ) { - if ( entry === undefined ) { - entry = new FilterJustOrigin(); - bucket.set(this.anyTokenHash, entry); + if ( iunit === undefined ) { + iunit = filterFromCtor(FilterJustOrigin); + bucket.set(this.anyTokenHash, iunit); } - entry.add(fdata); + units[iunit].add(fdata); continue; } if ( tokenHash === this.anyHTTPSTokenHash ) { - if ( entry === undefined ) { - entry = new FilterHTTPSJustOrigin(); - bucket.set(this.anyHTTPSTokenHash, entry); + if ( iunit === undefined ) { + iunit = filterFromCtor(FilterHTTPSJustOrigin); + bucket.set(this.anyHTTPSTokenHash, iunit); } - entry.add(fdata); + units[iunit].add(fdata); continue; } if ( tokenHash === this.anyHTTPTokenHash ) { - if ( entry === undefined ) { - entry = new FilterHTTPJustOrigin(); - bucket.set(this.anyHTTPTokenHash, entry); + if ( iunit === undefined ) { + iunit = filterFromCtor(FilterHTTPJustOrigin); + bucket.set(this.anyHTTPTokenHash, iunit); } - entry.add(fdata); + units[iunit].add(fdata); continue; } - this.urlTokenizer.addKnownToken(tokenHash); + urlTokenizer.addKnownToken(tokenHash); - if ( entry === undefined ) { - bucket.set(tokenHash, filterFromCompiledData(fdata)); + const inewunit = filterUnitFromCompiled(fdata); + + if ( iunit === undefined ) { + bucket.set(tokenHash, inewunit); continue; } - if ( entry.fid === filterBucketId ) { - entry.add(fdata); + let f = units[iunit]; + if ( f.fid === filterBucketId ) { + f.unshift(inewunit); continue; } - if ( entry.fid === filterPairId ) { - bucket.set( - tokenHash, - entry.upgrade(filterFromCompiledData(fdata)) - ); - continue; - } - bucket.set( - tokenHash, - new 
FilterPair(entry, filterFromCompiledData(fdata)) - ); + const ibucketunit = filterFromCtor(FilterBucket); + f = units[ibucketunit]; + f.unshift(iunit); + f.unshift(inewunit); + bucket.set(tokenHash, ibucketunit); } - this.filterParser.reset(); this.badFilters.clear(); this.goodFilters.clear(); + + for ( const bucket of this.categories.values() ) { + for ( const iunit of bucket.values() ) { + const f = units[iunit]; + if ( f instanceof FilterBucket === false ) { continue; } + f.optimize(); + } + } + FilterHostnameDict.optimize(); - FilterBucket.optimize(); + bidiTrieOptimize(); this.frozen = true; }; /******************************************************************************/ -// This is necessary for when the filtering engine readiness will depend -// on asynchronous operations (ex.: when loading a wasm module). - -FilterContainer.prototype.readyToUse = function() { - return Promise.resolve(); -}; - -/******************************************************************************/ - FilterContainer.prototype.toSelfie = function(path) { - const categoriesToSelfie = function(categoryMap) { + const categoriesToSelfie = ( ) => { const selfie = []; - for ( const [ catbits, bucket ] of categoryMap ) { - const tokenEntries = []; - for ( const [ token, filter ] of bucket ) { - tokenEntries.push([ token, filter.compile(true) ]); - } - selfie.push([ catbits, tokenEntries ]); + for ( const [ catbits, bucket ] of this.categories ) { + selfie.push([ catbits, Array.from(bucket) ]); } return selfie; }; @@ -2552,19 +2734,26 @@ FilterContainer.prototype.toSelfie = function(path) { filterOrigin.optimize(); return Promise.all([ - µBlock.assets.put( + µb.assets.put( `${path}/FilterHostnameDict.trieContainer`, - FilterHostnameDict.trieContainer.serialize(µBlock.base64) + FilterHostnameDict.trieContainer.serialize(µb.base64) ), - µBlock.assets.put( + µb.assets.put( `${path}/FilterOrigin.trieContainer`, - filterOrigin.trieContainer.serialize(µBlock.base64) + 
filterOrigin.trieContainer.serialize(µb.base64) ), - µBlock.assets.put( - `${path}/FilterBucket.trieContainer`, - FilterBucket.trieContainer.serialize(µBlock.base64) + µb.assets.put( + `${path}/bidiTrie`, + bidiTrie.serialize(µb.base64) ), - µBlock.assets.put( + µb.assets.put( + `${path}/filterSequences`, + µb.base64.encode( + filterSequences.buffer, + filterSequenceWritePtr << 2 + ) + ), + µb.assets.put( `${path}/main`, JSON.stringify({ processedFilterCount: this.processedFilterCount, @@ -2573,9 +2762,11 @@ FilterContainer.prototype.toSelfie = function(path) { allowFilterCount: this.allowFilterCount, blockFilterCount: this.blockFilterCount, discardedCount: this.discardedCount, - categories: categoriesToSelfie(this.categories), - urlTokenizer: this.urlTokenizer.toSelfie(), - filterOriginStrSlots: filterOrigin.strSlots, + categories: categoriesToSelfie(), + urlTokenizer: urlTokenizer.toSelfie(), + filterUnits: filterUnits.map(f => + f !== null ? f.toSelfie() : null + ), }) ) ]); @@ -2585,25 +2776,36 @@ FilterContainer.prototype.toSelfie = function(path) { FilterContainer.prototype.fromSelfie = function(path) { return Promise.all([ - µBlock.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details => + µb.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details => FilterHostnameDict.trieContainer.unserialize( details.content, - µBlock.base64 + µb.base64 ) ), - µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details => + µb.assets.get(`${path}/FilterOrigin.trieContainer`).then(details => filterOrigin.trieContainer.unserialize( details.content, - µBlock.base64 + µb.base64 ) ), - µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details => - FilterBucket.trieContainer.unserialize( + µb.assets.get(`${path}/bidiTrie`).then(details => + bidiTrie.unserialize( details.content, - µBlock.base64 + µb.base64 ) ), - µBlock.assets.get(`${path}/main`).then(details => { + µb.assets.get(`${path}/filterSequences`).then(details => { + const 
size = µb.base64.decodeSize(details.content) >> 2; + if ( size === 0 ) { return false; } + filterSequenceBufferResize(size); + filterSequences = µb.base64.decode( + details.content, + filterSequences.buffer + ); + filterSequenceWritePtr = size; + return true; + }), + µb.assets.get(`${path}/main`).then(details => { let selfie; try { selfie = JSON.parse(details.content); @@ -2617,14 +2819,12 @@ FilterContainer.prototype.fromSelfie = function(path) { this.allowFilterCount = selfie.allowFilterCount; this.blockFilterCount = selfie.blockFilterCount; this.discardedCount = selfie.discardedCount; - this.urlTokenizer.fromSelfie(selfie.urlTokenizer); - filterOrigin.strSlots = selfie.filterOriginStrSlots; + urlTokenizer.fromSelfie(selfie.urlTokenizer); + filterUnits = selfie.filterUnits.map(f => + f !== null ? filterFromSelfie(f) : null + ); for ( const [ catbits, bucket ] of selfie.categories ) { - const tokenMap = new Map(); - for ( const [ token, fdata ] of bucket ) { - tokenMap.set(token, filterFromCompiledData(fdata)); - } - this.categories.set(catbits, tokenMap); + this.categories.set(catbits, new Map(bucket)); } return true; }), @@ -2638,16 +2838,10 @@ FilterContainer.prototype.fromSelfie = function(path) { FilterContainer.prototype.compile = function(raw, writer) { // ORDER OF TESTS IS IMPORTANT! 
- // Ignore empty lines - const s = raw.trim(); - if ( s.length === 0 ) { return false; } + const parsed = FilterParser.parse(raw); - const parsed = this.filterParser.parse(s); - - // Ignore element-hiding filters - if ( parsed.elemHiding ) { - return false; - } + // Ignore non-static network filters + if ( parsed.invalid ) { return false; } // Ignore filters with unsupported options if ( parsed.unsupported ) { @@ -2672,9 +2866,7 @@ FilterContainer.prototype.compile = function(raw, writer) { }); return false; } - if ( parsed.redirect === 2 ) { - return true; - } + if ( parsed.redirect === 2 ) { return true; } } // Pure hostnames, use more efficient dictionary lookup @@ -2692,68 +2884,72 @@ FilterContainer.prototype.compile = function(raw, writer) { parsed.makeToken(); - let fdata; - if ( parsed.isRegex ) { - fdata = FilterRegex.compile(parsed); - } else if ( parsed.isPureHostname ) { - fdata = FilterPlainHostname.compile(parsed); - } else if ( parsed.f === '*' ) { - if ( parsed.isJustOrigin() ) { + const units = []; + + // Pattern + if ( parsed.isPureHostname ) { + parsed.anchor = 0; + units.push(FilterPlainHostname.compile(parsed)); + } else if ( parsed.isJustOrigin() ) { + const hostnames = parsed.domainOpt.split('|'); + if ( parsed.f === '*' ) { parsed.tokenHash = this.anyTokenHash; - for ( const hn of parsed.domainOpt.split('|') ) { + for ( const hn of hostnames ) { this.compileToAtomicFilter(parsed, hn, writer); } return true; } - fdata = FilterTrue.compile(); - } else if ( parsed.anchor === 0x5 ) { - fdata = FilterGenericHnAndRightAnchored.compile(parsed); - } else if ( parsed.anchor === 0x4 ) { - if ( - parsed.wildcarded === false && - parsed.tokenHash !== parsed.noTokenHash - ) { - fdata = FilterPlainHnAnchored.compile(parsed); - } else { - fdata = FilterGenericHnAnchored.compile(parsed); + if ( parsed.f.startsWith('https') ) { + parsed.tokenHash = this.anyHTTPSTokenHash; + for ( const hn of hostnames ) { + this.compileToAtomicFilter(parsed, hn, writer); + 
} + return true; } - } else if ( parsed.anchor === 0x2 && parsed.isJustOrigin() ) { - const hostnames = parsed.domainOpt.split('|'); - const isHTTPS = parsed.f === 'https://' || parsed.f === 'http*://'; - const isHTTP = parsed.f === 'http://' || parsed.f === 'http*://'; + parsed.tokenHash = this.anyHTTPTokenHash; for ( const hn of hostnames ) { - if ( isHTTPS ) { - parsed.tokenHash = this.anyHTTPSTokenHash; - this.compileToAtomicFilter(parsed, hn, writer); - } - if ( isHTTP ) { - parsed.tokenHash = this.anyHTTPTokenHash; - this.compileToAtomicFilter(parsed, hn, writer); - } + this.compileToAtomicFilter(parsed, hn, writer); } return true; - } else if ( parsed.wildcarded || parsed.tokenHash === parsed.noTokenHash ) { - fdata = FilterGeneric.compile(parsed); - } else if ( parsed.anchor === 0x2 ) { - fdata = FilterPlainLeftAnchored.compile(parsed); - } else if ( parsed.anchor === 0x1 ) { - fdata = FilterPlainRightAnchored.compile(parsed); - } else if ( parsed.anchor === 0x3 ) { - fdata = FilterExactMatch.compile(parsed); } else { - fdata = FilterPlain.compile(parsed); + filterPattern.compile(parsed, units); } + // Type + // EXPERIMENT: $requestTypeBit + //if ( (parsed.typeBits & allNetworkTypesBits) !== 0 ) { + // units.unshift(FilterType.compile(parsed)); + // parsed.typeBits &= ~allNetworkTypesBits; + //} + + // Anchor + if ( (parsed.anchor & 0b100) !== 0 ) { + units.push(FilterAnchorHn.compile()); + } else if ( (parsed.anchor & 0b010) !== 0 ) { + units.push(FilterAnchorLeft.compile()); + } + if ( (parsed.anchor & 0b001) !== 0 ) { + units.push(FilterAnchorRight.compile()); + } + + // Origin if ( parsed.domainOpt !== '' ) { - fdata = filterOrigin.compile(parsed, fdata); + filterOrigin.compile( + parsed, + units.length !== 0 && filterClasses[units[0][0]].isSlow === true, + units + ); } + // Data if ( parsed.dataType !== undefined ) { - let fwrapped = fdata; - fdata = FilterDataHolder.compile(parsed); - fdata.push(fwrapped); + 
units.push(FilterDataHolder.compile(parsed)); } + const fdata = units.length === 1 + ? units[0] + : FilterComposite.compile(units); + this.compileToAtomicFilter(parsed, fdata, writer); return true; @@ -2766,13 +2962,12 @@ FilterContainer.prototype.compileToAtomicFilter = function( fdata, writer ) { - // 0 = network filters // 1 = network filters: bad filters writer.select(parsed.badFilter ? 1 : 0); const descBits = parsed.action | parsed.important | parsed.party; - let typeBits = parsed.types; + let typeBits = parsed.typeBits; // Typeless if ( typeBits === 0 ) { @@ -2790,7 +2985,11 @@ FilterContainer.prototype.compileToAtomicFilter = function( let bitOffset = 1; do { if ( typeBits & 1 ) { - writer.push([ descBits | (bitOffset << 4), parsed.tokenHash, fdata ]); + writer.push( + [ descBits | (bitOffset << 4), + parsed.tokenHash, + fdata + ]); } bitOffset += 1; typeBits >>>= 1; @@ -2849,32 +3048,40 @@ FilterContainer.prototype.realmMatchAndFetchData = function( if ( bucket01 === undefined && bucket11 === undefined ) { return false; } - const url = $requestURL; - const tokenHashes = this.urlTokenizer.getTokens(); + const units = filterUnits; + const tokenHashes = urlTokenizer.getTokens(bidiTrie); const filters = []; - let i = 0, tokenBeg = 0, f; + let i = 0, iunit, f; for (;;) { const th = tokenHashes[i]; if ( th === 0 ) { return; } - tokenBeg = tokenHashes[i+1]; + $tokenBeg = tokenHashes[i+1]; if ( (bucket01 !== undefined) && - (f = bucket01.get(th)) !== undefined + (iunit = bucket01.get(th)) !== undefined ) { + f = units[iunit]; filters.length = 0; - f.matchAndFetchData(type, url, tokenBeg, filters); + f.matchAndFetchData(type, filters); for ( f of filters ) { - out.set(f.data, new FilterDataHolderResult(bits01, th, f)); + out.set( + f.getData(type), + new FilterDataHolderResult(bits01, th, iunit) + ); } } if ( (bucket11 !== undefined) && - (f = bucket11.get(th)) !== undefined + (iunit = bucket11.get(th)) !== undefined ) { + f = units[iunit]; filters.length = 0; - 
f.matchAndFetchData(type, url, tokenBeg, filters); + f.matchAndFetchData(type, filters); for ( f of filters ) { - out.set(f.data, new FilterDataHolderResult(bits11, th, f)); + out.set( + f.getData(type), + new FilterDataHolderResult(bits11, th, iunit) + ); } } i += 2; @@ -2884,7 +3091,7 @@ FilterContainer.prototype.realmMatchAndFetchData = function( /******************************************************************************/ FilterContainer.prototype.matchAndFetchData = function(fctxt, type) { - $requestURL = this.urlTokenizer.setURL(fctxt.url); + $requestURL = urlTokenizer.setURL(fctxt.url); $docHostname = fctxt.getDocHostname(); $requestHostname = fctxt.getHostname(); @@ -2973,72 +3180,72 @@ FilterContainer.prototype.realmMatchString = function( return false; } - let catBits = 0, f; + const units = filterUnits; + let catBits = 0, iunit = 0; // Pure hostname-based filters let tokenHash = this.dotTokenHash; if ( (bucket00 !== undefined) && - (f = bucket00.get(tokenHash)) !== undefined && - (f.match() === true) + (iunit = bucket00.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits00; } else if ( (bucket01 !== undefined) && - (f = bucket01.get(tokenHash)) !== undefined && - (f.match() === true) + (iunit = bucket01.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits01; } else if ( (bucket10 !== undefined) && - (f = bucket10.get(tokenHash)) !== undefined && - (f.match() === true) + (iunit = bucket10.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits10; } else if ( (bucket11 !== undefined) && - (f = bucket11.get(tokenHash)) !== undefined && - (f.match() === true) + (iunit = bucket11.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits11; } // Pattern-based filters else { - const url = $requestURL; - const tokenHashes = this.urlTokenizer.getTokens(); - let i = 0, tokenBeg = 0; + const tokenHashes = urlTokenizer.getTokens(bidiTrie); 
+ let i = 0; for (;;) { tokenHash = tokenHashes[i]; if ( tokenHash === 0 ) { return false; } - tokenBeg = tokenHashes[i+1]; + $tokenBeg = tokenHashes[i+1]; if ( (bucket00 !== undefined) && - (f = bucket00.get(tokenHash)) !== undefined && - (f.match(url, tokenBeg) === true) + (iunit = bucket00.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits00; break; } if ( (bucket01 !== undefined) && - (f = bucket01.get(tokenHash)) !== undefined && - (f.match(url, tokenBeg) === true) + (iunit = bucket01.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits01; break; } if ( (bucket10 !== undefined) && - (f = bucket10.get(tokenHash)) !== undefined && - (f.match(url, tokenBeg) === true) + (iunit = bucket10.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits10; break; } if ( (bucket11 !== undefined) && - (f = bucket11.get(tokenHash)) !== undefined && - (f.match(url, tokenBeg) === true) + (iunit = bucket11.get(tokenHash) || 0) !== 0 && + (units[iunit].match() === true) ) { catBits = catBits11; break; @@ -3049,7 +3256,7 @@ FilterContainer.prototype.realmMatchString = function( this.$catbits = catBits; this.$tokenHash = tokenHash; - this.$filter = f; + this.$filterUnit = iunit; return true; }; @@ -3070,8 +3277,8 @@ FilterContainer.prototype.matchStringElementHide = function(type, url) { const typeBits = typeNameToTypeValue[`${type}hide`] | 0x80000000; // Prime tokenizer: we get a normalized URL in return. - $requestURL = this.urlTokenizer.setURL(url); - this.$filter = null; + $requestURL = urlTokenizer.setURL(url); + this.$filterUnit = 0; // These registers will be used by various filters $docHostname = $requestHostname = µb.URI.hostnameFromURI(url); @@ -3098,37 +3305,39 @@ FilterContainer.prototype.matchStringElementHide = function(type, url) { // support unknown types. 
FilterContainer.prototype.matchString = function(fctxt, modifiers = 0) { - let typeBits = typeNameToTypeValue[fctxt.type]; + let typeValue = typeNameToTypeValue[fctxt.type]; if ( modifiers === 0 ) { - if ( typeBits === undefined ) { - typeBits = otherTypeBitValue; - } else if ( typeBits === 0 || typeBits > otherTypeBitValue ) { + if ( typeValue === undefined ) { + typeValue = otherTypeBitValue; + } else if ( typeValue === 0 || typeValue > otherTypeBitValue ) { modifiers |= 0b0001; } } + // EXPERIMENT: $requestTypeBit + //$requestTypeBit = 1 << ((typeValue >>> 4) - 1); if ( (modifiers & 0b0001) !== 0 ) { - if ( typeBits === undefined ) { return 0; } - typeBits |= 0x80000000; + if ( typeValue === undefined ) { return 0; } + typeValue |= 0x80000000; } const partyBits = fctxt.is3rdPartyToDoc() ? ThirdParty : FirstParty; // Prime tokenizer: we get a normalized URL in return. - $requestURL = this.urlTokenizer.setURL(fctxt.url); - this.$filter = null; + $requestURL = urlTokenizer.setURL(fctxt.url); + this.$filterUnit = 0; // These registers will be used by various filters $docHostname = fctxt.getDocHostname(); $requestHostname = fctxt.getHostname(); // Important block filters. 
- if ( this.realmMatchString(BlockImportant, typeBits, partyBits) ) { + if ( this.realmMatchString(BlockImportant, typeValue, partyBits) ) { return 1; } // Block filters - if ( this.realmMatchString(BlockAction, typeBits, partyBits) ) { + if ( this.realmMatchString(BlockAction, typeValue, partyBits) ) { // Exception filters - if ( this.realmMatchString(AllowAction, typeBits, partyBits) ) { + if ( this.realmMatchString(AllowAction, typeValue, partyBits) ) { return 2; } return 1; @@ -3139,21 +3348,17 @@ FilterContainer.prototype.matchString = function(fctxt, modifiers = 0) { /******************************************************************************/ FilterContainer.prototype.toLogData = function() { - if ( this.$filter === null ) { return; } + if ( this.$filterUnit === 0 ) { return; } const logData = toLogDataInternal( this.$catbits, this.$tokenHash, - this.$filter + this.$filterUnit ); logData.source = 'static'; logData.tokenHash = this.$tokenHash; - logData.result = this.$filter === null + logData.result = this.$filterUnit === 0 ? 0 - : ( - (this.$catbits & 1) !== 0 - ? 2 - : 1 - ); + : ((this.$catbits & 1) !== 0 ? 
2 : 1); return logData; }; @@ -3167,7 +3372,7 @@ FilterContainer.prototype.getFilterCount = function() { // action: 1=test, 2=record -FilterContainer.prototype.benchmark = async function(action) { +FilterContainer.prototype.benchmark = async function(action, target) { const requests = await µb.loadBenchmarkDataset(); if ( Array.isArray(requests) === false || requests.length === 0 ) { @@ -3177,6 +3382,20 @@ FilterContainer.prototype.benchmark = async function(action) { console.info(`Benchmarking staticNetFilteringEngine.matchString()...`); const fctxt = µb.filteringContext.duplicate(); + + if ( typeof target === 'number' ) { + const request = requests[target]; + fctxt.setURL(request.url); + fctxt.setDocOriginFromURL(request.frameUrl); + fctxt.setType(request.cpt); + const r = this.matchString(fctxt); + console.log(`Result=${r}:`); + console.log(`\ttype=${fctxt.type}`); + console.log(`\turl=${fctxt.url}`); + console.log(`\tdocOrigin=${fctxt.getDocOrigin()}`); + return; + } + let expected, recorded; if ( action === 1 ) { try { @@ -3199,7 +3418,7 @@ FilterContainer.prototype.benchmark = async function(action) { const r = this.matchString(fctxt); if ( recorded !== undefined ) { recorded.push(r); } if ( expected !== undefined && r !== expected[i] ) { - console.log('Mismatch with reference results:'); + console.log(`Mismatch with reference results at ${i}:`); console.log(`\tExpected ${expected[i]}, got ${r}:`); console.log(`\ttype=${fctxt.type}`); console.log(`\turl=${fctxt.url}`); @@ -3273,29 +3492,25 @@ FilterContainer.prototype.test = function(docURL, type, url) { */ FilterContainer.prototype.bucketHistogram = function() { + const units = filterUnits; const results = []; for ( const [ bits, category ] of this.categories ) { - for ( const [ th, f ] of category ) { - if ( f instanceof FilterPair ) { - const token = µBlock.urlTokenizer.stringFromTokenHash(th); - results.push({ bits: bits.toString(16), token, size: f.size, f }); - continue; - } + for ( const [ th, iunit ] 
of category ) { + const token = urlTokenizer.stringFromTokenHash(th); + const f = units[iunit]; if ( f instanceof FilterBucket ) { - const token = µBlock.urlTokenizer.stringFromTokenHash(th); results.push({ bits: bits.toString(16), token, size: f.size, f }); continue; } if ( f instanceof FilterHostnameDict ) { - const token = µBlock.urlTokenizer.stringFromTokenHash(th); results.push({ bits: bits.toString(16), token, size: f.size, f }); continue; } if ( f instanceof FilterJustOrigin ) { - const token = µBlock.urlTokenizer.stringFromTokenHash(th); results.push({ bits: bits.toString(16), token, size: f.size, f }); continue; } + results.push({ bits: bits.toString(16), token, size: 1, f }); } } results.sort((a, b) => { @@ -3308,36 +3523,6 @@ FilterContainer.prototype.bucketHistogram = function() { With default filter lists: - As of 2019-04-13: - - {"FilterPlainHnAnchored" => 12619} - {"FilterPlainPrefix1" => 8743} - {"FilterGenericHnAnchored" => 5231} - {"FilterOriginHit" => 4149} - {"FilterPair" => 2381} - {"FilterBucket" => 1940} - {"FilterPlainHostname" => 1612} - {"FilterOriginHitSet" => 1430} - {"FilterPlainLeftAnchored" => 799} - {"FilterGeneric" => 588} - {"FilterPlain" => 510} - {"FilterOriginMiss" => 299} - {"FilterDataHolder" => 280} - {"FilterOriginMissSet" => 150} - {"FilterTrue" => 130} - {"FilterRegex" => 124} - {"FilterPlainRightAnchored" => 110} - {"FilterGenericHnAndRightAnchored" => 95} - {"FilterHostnameDict" => 59} - {"FilterPlainPrefix0" => 29} - {"FilterExactMatch" => 5} - {"FilterOriginMixedSet" => 3} - - Observations: - - No need for FilterPlainPrefix0. - - FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates - for storing in a plain string trie. 
- As of 2019-04-25: {"FilterPlainHnAnchored" => 11078} @@ -3368,44 +3553,71 @@ FilterContainer.prototype.bucketHistogram = function() { {"FilterExactMatch" => 5} {"FilterOriginMixedSet" => 3} + As of 2019-10-21: + + "FilterPatternPlain" => 27542} + "FilterComposite" => 17249} + "FilterPlainTrie" => 13235} + "FilterAnchorHn" => 11938} + "FilterPatternRightEx" => 4446} + "FilterOriginHit" => 4435} + "FilterBucket" => 3833} + "FilterPatternRight" => 3426} + "FilterPlainHostname" => 2786} + "FilterOriginHitSet" => 1433} + "FilterDataHolder" => 666} + "FilterPatternGeneric" => 548} + "FilterOriginMiss" => 441} + "FilterOriginMissSet" => 208} + "FilterTrailingSeparator" => 188} + "FilterRegex" => 181} + "FilterPatternLeft" => 172} + "FilterAnchorRight" => 100} + "FilterPatternLeftEx" => 82} + "FilterHostnameDict" => 60} + "FilterAnchorLeft" => 50} + "FilterJustOrigin" => 24} + "FilterHTTPJustOrigin" => 18} + "FilterTrue" => 17} + "FilterHTTPSJustOrigin" => 17} + */ FilterContainer.prototype.filterClassHistogram = function() { const filterClassDetails = new Map(); - for ( let i = 0; i < filterClasses.length; i++ ) { - filterClassDetails.set(i, { name: filterClasses[i].name, count: 0, }); + for ( const fclass of filterClasses ) { + filterClassDetails.set(fclass.fid, { name: fclass.name, count: 0, }); } // Artificial classes to report content of tries filterClassDetails.set(1000, { name: 'FilterPlainTrie', count: 0, }); - filterClassDetails.set(1001, { name: 'FilterPlainHnAnchoredTrie', count: 0, }); const countFilter = function(f) { if ( f instanceof Object === false ) { return; } filterClassDetails.get(f.fid).count += 1; - if ( f.wrapped ) { - countFilter(f.wrapped); - } }; - for ( const category of this.categories.values() ) { - for ( const f of category.values() ) { - countFilter(f); - if ( f instanceof FilterBucket ) { - for ( const g of f.filters ) { countFilter(g); } - if ( f.plainTrie !== null ) { - filterClassDetails.get(1000).count += f.plainTrie.size; - } - if ( 
f.plainHnAnchoredTrie !== null ) { - filterClassDetails.get(1001).count += f.plainHnAnchoredTrie.size; - } - continue; + for ( const f of filterUnits ) { + if ( f === null ) { continue; } + countFilter(f); + if ( f instanceof FilterBucket ) { + let i = f.i; + while ( i !== 0 ) { + countFilter(filterUnits[filterSequences[i+0]]); + i = filterSequences[i+1]; } - if ( f instanceof FilterPair ) { - countFilter(f.f1); - countFilter(f.f2); - continue; + if ( f.plainTrie !== null ) { + filterClassDetails.get(1000).count += f.plainTrie.size; } + continue; + } + if ( f instanceof FilterComposite ) { + let i = f.i; + while ( i !== 0 ) { + countFilter(filterUnits[filterSequences[i+0]]); + i = filterSequences[i+1]; + } + continue; } } const results = Array.from(filterClassDetails.values()).sort((a, b) => { @@ -3416,6 +3628,48 @@ FilterContainer.prototype.filterClassHistogram = function() { /******************************************************************************/ +FilterContainer.prototype.tokenHistograms = async function() { + const requests = await µb.loadBenchmarkDataset(); + + if ( Array.isArray(requests) === false || requests.length === 0 ) { + console.info('No requests found to benchmark'); + return; + } + + console.info(`Computing token histograms...`); + const fctxt = µb.filteringContext.duplicate(); + + const missTokenMap = new Map(); + const hitTokenMap = new Map(); + const reTokens = /[0-9a-z%]{2,}/g; + + for ( let i = 0; i < requests.length; i++ ) { + const request = requests[i]; + fctxt.setURL(request.url); + fctxt.setDocOriginFromURL(request.frameUrl); + fctxt.setType(request.cpt); + const r = this.matchString(fctxt); + for ( let [ keyword ] of request.url.toLowerCase().matchAll(reTokens) ) { + const token = keyword; + if ( r === 0 ) { + missTokenMap.set(token, (missTokenMap.get(token) || 0) + 1); + } else if ( r === 1 ) { + hitTokenMap.set(token, (hitTokenMap.get(token) || 0) + 1); + } + } + } + const customSort = (a, b) => b[1] - a[1]; + const topmisses = 
Array.from(missTokenMap).sort(customSort).slice(0, 100); + for ( const [ token ] of topmisses ) { + hitTokenMap.delete(token); + } + const tophits = Array.from(hitTokenMap).sort(customSort).slice(0, 100); + console.log('Misses:', JSON.stringify(topmisses)); + console.log('Hits:', JSON.stringify(tophits)); +}; + +/******************************************************************************/ + return new FilterContainer(); /******************************************************************************/ diff --git a/src/js/storage.js b/src/js/storage.js index 3e8c7f9ad..e78ef1cab 100644 --- a/src/js/storage.js +++ b/src/js/storage.js @@ -1101,6 +1101,7 @@ catch (reason) { log.info(reason); } + destroy(); return false; }; diff --git a/src/js/strie.js b/src/js/strie.js index 88ec4badc..94f9146c0 100644 --- a/src/js/strie.js +++ b/src/js/strie.js @@ -31,12 +31,13 @@ A BidiTrieContainer is mostly a large buffer in which distinct but related tries are stored. The memory layout of the buffer is as follow: - 0-255: reserved - 256-259: offset to start of trie data section (=> trie0) - 260-263: offset to end of trie data section (=> trie1) - 264-267: offset to start of character data section (=> char0) - 268-271: offset to end of character data section (=> char1) - 272: start of trie data section + 0-2047: haystack section + 2048-2051: number of significant characters in the haystack + 2052-2055: offset to start of trie data section (=> trie0) + 2056-2059: offset to end of trie data section (=> trie1) + 2060-2063: offset to start of character data section (=> char0) + 2064-2067: offset to end of character data section (=> char1) + 2068: start of trie data section +--------------+ Normal cell: | And | If "Segment info" matches: @@ -99,35 +100,56 @@ */ -const PAGE_SIZE = 65536; - // i32 / i8 -const TRIE0_SLOT = 256 >>> 2; // 64 / 256 -const TRIE1_SLOT = TRIE0_SLOT + 1; // 65 / 260 -const CHAR0_SLOT = TRIE0_SLOT + 2; // 66 / 264 -const CHAR1_SLOT = TRIE0_SLOT + 3; // 67 / 268 
-const TRIE0_START = TRIE0_SLOT + 4 << 2; // 272 +const PAGE_SIZE = 65536*2; + // i32 / i8 +const HAYSTACK_START = 0; +const HAYSTACK_SIZE = 2048; +const HAYSTACK_SIZE_SLOT = HAYSTACK_SIZE >>> 2; // 512 / 2048 +const TRIE0_SLOT = HAYSTACK_SIZE_SLOT + 1; // 512 / 2052 +const TRIE1_SLOT = HAYSTACK_SIZE_SLOT + 2; // 513 / 2056 +const CHAR0_SLOT = HAYSTACK_SIZE_SLOT + 3; // 514 / 2060 +const CHAR1_SLOT = HAYSTACK_SIZE_SLOT + 4; // 515 / 2064 +const TRIE0_START = HAYSTACK_SIZE_SLOT + 5 << 2; // 2068 +// TODO: need a few slots for result values if WASM-ing const CELL_BYTE_LENGTH = 12; -const MIN_FREE_CELL_BYTE_LENGTH = CELL_BYTE_LENGTH * 4; +const MIN_FREE_CELL_BYTE_LENGTH = CELL_BYTE_LENGTH * 8; const CELL_AND = 0; const CELL_OR = 1; -const BCELL_RIGHT_AND = 0; -const BCELL_LEFT_AND = 1; const SEGMENT_INFO = 2; +const BCELL_NEXT_AND = 0; +const BCELL_ALT_AND = 1; +const BCELL_EXTRA = 2; +const BCELL_EXTRA_MAX = 0x00FFFFFF; + +const toSegmentInfo = (aL, l, r) => ((r - l) << 24) | (aL + l); +const roundToPageSize = v => (v + PAGE_SIZE-1) & ~(PAGE_SIZE-1); µBlock.BidiTrieContainer = class { - constructor(details) { + constructor(details, extraHandler) { if ( details instanceof Object === false ) { details = {}; } - const len = (details.byteLength || 0) + PAGE_SIZE-1 & ~(PAGE_SIZE-1); - this.buf = new Uint8Array(Math.max(len, 131072)); - this.buf32 = new Uint32Array(this.buf.buffer); + const len = roundToPageSize(details.byteLength || 0); + const minInitialLen = PAGE_SIZE * 4; + this.buf8 = new Uint8Array(Math.max(len, minInitialLen)); + this.buf32 = new Uint32Array(this.buf8.buffer); this.buf32[TRIE0_SLOT] = TRIE0_START; this.buf32[TRIE1_SLOT] = this.buf32[TRIE0_SLOT]; - this.buf32[CHAR0_SLOT] = details.char0 || 65536; + this.buf32[CHAR0_SLOT] = details.char0 || (minInitialLen >>> 1); this.buf32[CHAR1_SLOT] = this.buf32[CHAR0_SLOT]; + this.haystack = this.buf8.subarray( + HAYSTACK_START, + HAYSTACK_START + HAYSTACK_SIZE + ); + this.haystackSize = 0; + this.extraHandler = 
extraHandler; + this.textDecoder = null; + + this.$l = 0; + this.$r = 0; + this.$iu = 0; } //-------------------------------------------------------------------------- @@ -139,16 +161,16 @@ const SEGMENT_INFO = 2; this.buf32[CHAR1_SLOT] = this.buf32[CHAR0_SLOT]; } - matches(iroot, a, i) { + matches(iroot, i) { const buf32 = this.buf32; - const buf8 = this.buf; + const buf8 = this.buf8; const char0 = buf32[CHAR0_SLOT]; - const aR = a.length; + const aR = this.haystackSize; let icell = iroot; let al = i; let c, v, bl, n; for (;;) { - c = a.charCodeAt(al); + c = buf8[al]; al += 1; // find first segment with a first-character match for (;;) { @@ -156,43 +178,51 @@ const SEGMENT_INFO = 2; bl = char0 + (v & 0x00FFFFFF); if ( buf8[bl] === c ) { break; } icell = buf32[icell+CELL_OR]; - if ( icell === 0 ) { return -1; } + if ( icell === 0 ) { return false; } } // all characters in segment must match n = v >>> 24; if ( n > 1 ) { n -= 1; - if ( (al + n) > aR ) { return -1; } + if ( (al + n) > aR ) { return false; } bl += 1; for ( let i = 0; i < n; i++ ) { - if ( a.charCodeAt(al+i) !== buf8[bl+i] ) { return -1; } + if ( buf8[al+i] !== buf8[bl+i] ) { return false; } } al += n; } // next segment icell = buf32[icell+CELL_AND]; - if ( /* icell === 0 || */ buf32[icell+SEGMENT_INFO] === 0 ) { - const inext = buf32[icell+BCELL_LEFT_AND]; - if ( inext === 0 ) { return (i << 16) | al; } - const r = this.matchesLeft(inext, a, i); - if ( r !== -1 ) { return (r << 16) | al; } - icell = buf32[icell+CELL_AND]; - if ( icell === 0 ) { return -1; } + const ix = buf32[icell+BCELL_EXTRA]; + if ( ix <= BCELL_EXTRA_MAX ) { + if ( ix !== 0 ) { + const iu = ix === 1 ? 
-1 : this.extraHandler(i, al, ix); + if ( iu !== 0 ) { + this.$l = i; this.$r = al; this.$iu = iu; return true; + } + } + let inext = buf32[icell+BCELL_ALT_AND]; + if ( inext !== 0 ) { + if ( this.matchesLeft(inext, i, al) ) { return true; } + } + inext = buf32[icell+BCELL_NEXT_AND]; + if ( inext === 0 ) { return false; } + icell = inext; } - if ( al === aR ) { return -1; } + if ( al === aR ) { return false; } } } - matchesLeft(iroot, a, i) { + matchesLeft(iroot, i, r) { const buf32 = this.buf32; - const buf8 = this.buf; + const buf8 = this.buf8; const char0 = buf32[CHAR0_SLOT]; let icell = iroot; let ar = i; let c, v, br, n; for (;;) { ar -= 1; - c = a.charCodeAt(ar); + c = buf8[ar]; // find first segment with a first-character match for (;;) { v = buf32[icell+SEGMENT_INFO]; @@ -200,21 +230,31 @@ const SEGMENT_INFO = 2; br = char0 + (v & 0x00FFFFFF) + n - 1; if ( buf8[br] === c ) { break; } icell = buf32[icell+CELL_OR]; - if ( icell === 0 ) { return -1; } + if ( icell === 0 ) { return false; } } // all characters in segment must match if ( n > 1 ) { n -= 1; - if ( n > ar ) { return -1; } + if ( n > ar ) { return false; } for ( let i = 1; i <= n; i++ ) { - if ( a.charCodeAt(ar-i) !== buf8[br-i] ) { return -1; } + if ( buf8[ar-i] !== buf8[br-i] ) { return false; } } ar -= n; } // next segment icell = buf32[icell+CELL_AND]; - if ( icell === 0 || buf32[icell+SEGMENT_INFO] === 0 ) { return ar; } - if ( ar === 0 ) { return -1; } + const ix = buf32[icell+BCELL_EXTRA]; + if ( ix <= BCELL_EXTRA_MAX ) { + if ( ix !== 0 ) { + const iu = ix === 1 ? 
-1 : this.extraHandler(ar, r, ix); + if ( iu !== 0 ) { + this.$l = ar; this.$r = r; this.$iu = iu; return true; + } + } + icell = buf32[icell+BCELL_NEXT_AND]; + if ( icell === 0 ) { return false; } + } + if ( ar === 0 ) { return false; } } } @@ -238,43 +278,47 @@ const SEGMENT_INFO = 2; return [ trieRef.iroot, trieRef.size ]; } - add(iroot, a, i = 0) { - const aR = a.length; + add(iroot, aL0, n, pivot = 0) { + const aR = n; if ( aR === 0 ) { return 0; } - // grow buffer if needed + // Grow buffer if needed. The characters are already in our character + // data buffer, so we do not need to grow character data buffer. if ( - (this.buf32[CHAR0_SLOT] - this.buf32[TRIE1_SLOT]) < MIN_FREE_CELL_BYTE_LENGTH || - (this.buf.length - this.buf32[CHAR1_SLOT]) < 256 + (this.buf32[CHAR0_SLOT] - this.buf32[TRIE1_SLOT]) < + MIN_FREE_CELL_BYTE_LENGTH ) { - this.growBuf(MIN_FREE_CELL_BYTE_LENGTH, 256); + this.growBuf(MIN_FREE_CELL_BYTE_LENGTH, 0); } const buf32 = this.buf32; + const char0 = buf32[CHAR0_SLOT]; let icell = iroot; + let aL = char0 + aL0; // special case: first node in trie if ( buf32[icell+SEGMENT_INFO] === 0 ) { - buf32[icell+SEGMENT_INFO] = this.addSegment(a, i, aR); - return this.addLeft(icell, a, i); + buf32[icell+SEGMENT_INFO] = toSegmentInfo(aL0, pivot, aR); + return this.addLeft(icell, aL0, pivot); } - const buf8 = this.buf; - const char0 = buf32[CHAR0_SLOT]; - let al = i; + const buf8 = this.buf8; + let al = pivot; let inext; // find a matching cell: move down for (;;) { const binfo = buf32[icell+SEGMENT_INFO]; + // length of segment + const bR = binfo >>> 24; // skip boundary cells - if ( binfo === 0 ) { - icell = buf32[icell+BCELL_RIGHT_AND]; + if ( bR === 0 ) { + icell = buf32[icell+BCELL_NEXT_AND]; continue; } let bl = char0 + (binfo & 0x00FFFFFF); // if first character is no match, move to next descendant - if ( buf8[bl] !== a.charCodeAt(al) ) { + if ( buf8[bl] !== buf8[aL+al] ) { inext = buf32[icell+CELL_OR]; if ( inext === 0 ) { - inext = this.addCell(0, 
0, this.addSegment(a, al, aR)); + inext = this.addCell(0, 0, toSegmentInfo(aL0, al, aR)); buf32[icell+CELL_OR] = inext; - return this.addLeft(inext, a, i); + return this.addLeft(inext, aL0, pivot); } icell = inext; continue; @@ -283,12 +327,11 @@ const SEGMENT_INFO = 2; let bi = 1; al += 1; // find 1st mismatch in rest of segment - const bR = binfo >>> 24; if ( bR !== 1 ) { for (;;) { if ( bi === bR ) { break; } if ( al === aR ) { break; } - if ( buf8[bl+bi] !== a.charCodeAt(al) ) { break; } + if ( buf8[bl+bi] !== buf8[aL+al] ) { break; } bi += 1; al += 1; } @@ -297,7 +340,7 @@ const SEGMENT_INFO = 2; if ( bi === bR ) { // needle remainder: no if ( al === aR ) { - return this.addLeft(icell, a, i); + return this.addLeft(icell, aL0, pivot); } // needle remainder: yes inext = buf32[icell+CELL_AND]; @@ -306,81 +349,97 @@ const SEGMENT_INFO = 2; continue; } // add needle remainder - icell = this.addCell(0, 0, this.addSegment(a, al, aR)); + icell = this.addCell(0, 0, toSegmentInfo(aL0, al, aR)); buf32[inext+CELL_AND] = icell; - return this.addLeft(icell, a, i); + return this.addLeft(icell, aL0, pivot); } // some characters matched // split current segment bl -= char0; buf32[icell+SEGMENT_INFO] = bi << 24 | bl; inext = this.addCell( - buf32[icell+CELL_AND], - 0, - bR - bi << 24 | bl + bi + buf32[icell+CELL_AND], 0, bR - bi << 24 | bl + bi ); buf32[icell+CELL_AND] = inext; // needle remainder: no = need boundary cell if ( al === aR ) { - return this.addLeft(icell, a, i); + return this.addLeft(icell, aL0, pivot); } // needle remainder: yes = need new cell for remaining characters - icell = this.addCell(0, 0, this.addSegment(a, al, aR)); + icell = this.addCell(0, 0, toSegmentInfo(aL0, al, aR)); buf32[inext+CELL_OR] = icell; - return this.addLeft(icell, a, i); + return this.addLeft(icell, aL0, pivot); } } - addLeft(icell, a, i) { + addLeft(icell, aL0, pivot) { const buf32 = this.buf32; + const char0 = buf32[CHAR0_SLOT]; + let aL = aL0 + char0; // fetch boundary cell - let 
inext = buf32[icell+CELL_AND]; + let iboundary = buf32[icell+CELL_AND]; // add boundary cell if none exist - if ( inext === 0 || buf32[inext+SEGMENT_INFO] !== 0 ) { - const iboundary = this.allocateCell(); + if ( + iboundary === 0 || + buf32[iboundary+SEGMENT_INFO] > BCELL_EXTRA_MAX + ) { + const inext = iboundary; + iboundary = this.allocateCell(); buf32[icell+CELL_AND] = iboundary; - buf32[iboundary+BCELL_RIGHT_AND] = inext; - if ( i === 0 ) { return 1; } - buf32[iboundary+BCELL_LEFT_AND] = this.allocateCell(); - inext = iboundary; + buf32[iboundary+BCELL_NEXT_AND] = inext; + if ( pivot === 0 ) { return iboundary; } } - // shortest match is always first so no point storing whatever is left - if ( buf32[inext+BCELL_LEFT_AND] === 0 ) { - return i === 0 ? 0 : 1; + // shortest match with no extra conditions will always win + if ( buf32[iboundary+BCELL_EXTRA] === 1 ) { + return iboundary; } // bail out if no left segment - if ( i === 0 ) { - buf32[inext+BCELL_LEFT_AND] = 0; - return 1; - } + if ( pivot === 0 ) { return iboundary; } // fetch root cell of left segment - icell = buf32[inext+BCELL_LEFT_AND]; + icell = buf32[iboundary+BCELL_ALT_AND]; + if ( icell === 0 ) { + icell = this.allocateCell(); + buf32[iboundary+BCELL_ALT_AND] = icell; + } // special case: first node in trie if ( buf32[icell+SEGMENT_INFO] === 0 ) { - buf32[icell+SEGMENT_INFO] = this.addSegment(a, 0, i); - return 1; + buf32[icell+SEGMENT_INFO] = toSegmentInfo(aL0, 0, pivot); + iboundary = this.allocateCell(); + buf32[icell+CELL_AND] = iboundary; + return iboundary; } - const buf8 = this.buf; - const char0 = buf32[CHAR0_SLOT]; - let ar = i; + const buf8 = this.buf8; + let ar = pivot, inext; // find a matching cell: move down for (;;) { const binfo = buf32[icell+SEGMENT_INFO]; // skip boundary cells - if ( binfo === 0 ) { - icell = buf32[icell+CELL_AND]; - continue; + if ( binfo <= BCELL_EXTRA_MAX ) { + inext = buf32[icell+CELL_AND]; + if ( inext !== 0 ) { + icell = inext; + continue; + } + iboundary 
= this.allocateCell(); + buf32[icell+CELL_AND] = + this.addCell(iboundary, 0, toSegmentInfo(aL0, 0, ar)); + // TODO: boundary cell might be last + // add remainder + boundary cell + return iboundary; } const bL = char0 + (binfo & 0x00FFFFFF); const bR = bL + (binfo >>> 24); let br = bR; // if first character is no match, move to next descendant - if ( buf8[br-1] !== a.charCodeAt(ar-1) ) { + if ( buf8[br-1] !== buf8[aL+ar-1] ) { inext = buf32[icell+CELL_OR]; if ( inext === 0 ) { - inext = this.addCell(0, 0, this.addSegment(a, 0, ar)); + iboundary = this.allocateCell(); + inext = this.addCell( + iboundary, 0, toSegmentInfo(aL0, 0, ar) + ); buf32[icell+CELL_OR] = inext; - return 1; + return iboundary; } icell = inext; continue; @@ -393,37 +452,52 @@ const SEGMENT_INFO = 2; for (;;) { if ( br === bL ) { break; } if ( ar === 0 ) { break; } - if ( buf8[br-1] !== a.charCodeAt(ar-1) ) { break; } + if ( buf8[br-1] !== buf8[aL+ar-1] ) { break; } br -= 1; ar -= 1; } } // all segment characters matched + // a: ...vvvvvvv + // b: vvvvvvv if ( br === bL ) { inext = buf32[icell+CELL_AND]; // needle remainder: no + // a: vvvvvvv + // b: vvvvvvv + // r: 0 & vvvvvvv if ( ar === 0 ) { // boundary cell already present - if ( inext === 0 || buf32[inext+SEGMENT_INFO] === 0 ) { - return 0; + if ( buf32[inext+BCELL_EXTRA] <= BCELL_EXTRA_MAX ) { + return inext; } // need boundary cell - buf32[icell+CELL_AND] = this.addCell(inext, 0, 0); + iboundary = this.allocateCell(); + buf32[iboundary+CELL_AND] = inext; + buf32[icell+CELL_AND] = iboundary; + return iboundary; } // needle remainder: yes + // a: yyyyyyyvvvvvvv + // b: vvvvvvv else { if ( inext !== 0 ) { icell = inext; continue; } + // TODO: we should never reach here because there will + // always be a boundary cell. 
+ debugger; // jshint ignore:line // boundary cell + needle remainder inext = this.addCell(0, 0, 0); buf32[icell+CELL_AND] = inext; buf32[inext+CELL_AND] = - this.addCell(0, 0, this.addSegment(a, 0, ar)); + this.addCell(0, 0, toSegmentInfo(aL0, 0, ar)); } } // some segment characters matched + // a: ...vvvvvvv + // b: yyyyyyyvvvvvvv else { // split current cell buf32[icell+SEGMENT_INFO] = (bR - br) << 24 | (br - char0); @@ -432,25 +506,38 @@ const SEGMENT_INFO = 2; 0, (br - bL) << 24 | (bL - char0) ); - buf32[icell+CELL_AND] = inext; // needle remainder: no = need boundary cell + // a: vvvvvvv + // b: yyyyyyyvvvvvvv + // r: yyyyyyy & 0 & vvvvvvv if ( ar === 0 ) { - buf32[icell+CELL_AND] = this.addCell(inext, 0, 0); + iboundary = this.allocateCell(); + buf32[icell+CELL_AND] = iboundary; + buf32[iboundary+CELL_AND] = inext; + return iboundary; } - // needle remainder: yes = need new cell for remaining characters + // needle remainder: yes = need new cell for remaining + // characters + // a: wwwwvvvvvvv + // b: yyyyyyyvvvvvvv + // r: (0 & wwww | yyyyyyy) & vvvvvvv else { - buf32[inext+CELL_OR] = - this.addCell(0, 0, this.addSegment(a, 0, ar)); + buf32[icell+CELL_AND] = inext; + iboundary = this.allocateCell(); + buf32[inext+CELL_OR] = this.addCell( + iboundary, 0, toSegmentInfo(aL0, 0, ar) + ); + return iboundary; } } - return 1; + //debugger; // jshint ignore:line } } optimize() { this.shrinkBuf(); return { - byteLength: this.buf.byteLength, + byteLength: this.buf8.byteLength, char0: this.buf32[CHAR0_SLOT], }; } @@ -477,19 +564,117 @@ const SEGMENT_INFO = 2; ? 
decoder.decodeSize(selfie) : selfie.length << 2; if ( byteLength === 0 ) { return false; } - byteLength = byteLength + PAGE_SIZE-1 & ~(PAGE_SIZE-1); - if ( byteLength > this.buf.length ) { - this.buf = new Uint8Array(byteLength); - this.buf32 = new Uint32Array(this.buf.buffer); + byteLength = roundToPageSize(byteLength); + if ( byteLength > this.buf8.length ) { + this.buf8 = new Uint8Array(byteLength); + this.buf32 = new Uint32Array(this.buf8.buffer); + this.haystack = this.buf8.subarray( + HAYSTACK_START, + HAYSTACK_START + HAYSTACK_SIZE + ); } if ( shouldDecode ) { - decoder.decode(selfie, this.buf.buffer); + decoder.decode(selfie, this.buf8.buffer); } else { this.buf32.set(selfie); } return true; } + storeString(s) { + const n = s.length; + if ( (this.buf8.length - this.buf32[CHAR1_SLOT]) < n ) { + this.growBuf(0, n); + } + const offset = this.buf32[CHAR1_SLOT]; + this.buf32[CHAR1_SLOT] = offset + n; + const buf8 = this.buf8; + for ( let i = 0; i < n; i++ ) { + buf8[offset+i] = s.charCodeAt(i); + } + return offset - this.buf32[CHAR0_SLOT]; + } + + extractString(i, n) { + if ( this.textDecoder === null ) { + this.textDecoder = new TextDecoder(); + } + const offset = this.buf32[CHAR0_SLOT] + i; + return this.textDecoder.decode( + this.buf8.subarray(offset, offset + n) + ); + } + + // WASMable. + startsWith(haystackOffset, needleOffset, needleLen) { + if ( (haystackOffset + needleLen) > this.haystackSize ) { + return false; + } + const haystackCodes = this.haystack; + const needleCodes = this.buf8; + needleOffset += this.buf32[CHAR0_SLOT]; + for ( let i = 0; i < needleLen; i++ ) { + if ( + haystackCodes[haystackOffset+i] !== + needleCodes[needleOffset+i] + ) { + return false; + } + } + return true; + } + + // Find the left-most instance of substring in main string + // WASMable. 
+ indexOf(haystackBeg, haystackEnd, needleOffset, needleLen) { + haystackEnd -= needleLen; + if ( haystackEnd < haystackBeg ) { return -1; } + const haystackCodes = this.haystack; + const needleCodes = this.buf8; + needleOffset += this.buf32[CHAR0_SLOT]; + let i = haystackBeg; + let c0 = needleCodes[needleOffset]; + for (;;) { + i = haystackCodes.indexOf(c0, i); + if ( i === -1 || i > haystackEnd ) { return -1; } + let j = 1; + while ( j < needleLen ) { + if ( haystackCodes[i+j] !== needleCodes[needleOffset+j] ) { + break; + } + j += 1; + } + if ( j === needleLen ) { return i; } + i += 1; + } + return -1; + } + + // Find the right-most instance of substring in main string. + // WASMable. + lastIndexOf(haystackBeg, haystackEnd, needleOffset, needleLen) { + needleOffset += this.buf32[CHAR0_SLOT]; + let i = haystackEnd - needleLen; + if ( i < haystackBeg ) { return -1; } + const haystackCodes = this.haystack; + const needleCodes = this.buf8; + let c0 = needleCodes[needleOffset]; + for (;;) { + i = haystackCodes.lastIndexOf(c0, i); + if ( i === -1 || i < haystackBeg ) { return -1; } + let j = 1; + while ( j < needleLen ) { + if ( haystackCodes[i+j] !== needleCodes[needleOffset+j] ) { + break; + } + j += 1; + } + if ( j === needleLen ) { return i; } + i -= 1; + } + return -1; + } + //-------------------------------------------------------------------------- // Private methods //-------------------------------------------------------------------------- @@ -512,28 +697,15 @@ const SEGMENT_INFO = 2; return icell; } - addSegment(s, l, r) { - const n = r - l; - if ( n === 0 ) { return 0; } - const buf32 = this.buf32; - const des = buf32[CHAR1_SLOT]; - buf32[CHAR1_SLOT] = des + n; - const buf8 = this.buf; - for ( let i = 0; i < n; i++ ) { - buf8[des+i] = s.charCodeAt(l+i); - } - return (n << 24) | (des - buf32[CHAR0_SLOT]); - } - growBuf(trieGrow, charGrow) { const char0 = Math.max( - (this.buf32[TRIE1_SLOT] + trieGrow + PAGE_SIZE-1) & ~(PAGE_SIZE-1), + 
roundToPageSize(this.buf32[TRIE1_SLOT] + trieGrow), this.buf32[CHAR0_SLOT] ); const char1 = char0 + this.buf32[CHAR1_SLOT] - this.buf32[CHAR0_SLOT]; const bufLen = Math.max( - (char1 + charGrow + PAGE_SIZE-1) & ~(PAGE_SIZE-1), - this.buf.length + roundToPageSize(char1 + charGrow), + this.buf8.length ); this.resizeBuf(bufLen, char0); } @@ -546,46 +718,26 @@ const SEGMENT_INFO = 2; } resizeBuf(bufLen, char0) { - bufLen = bufLen + PAGE_SIZE-1 & ~(PAGE_SIZE-1); - if ( - bufLen === this.buf.length && - char0 === this.buf32[CHAR0_SLOT] - ) { + bufLen = roundToPageSize(bufLen); + if ( bufLen === this.buf8.length && char0 === this.buf32[CHAR0_SLOT] ) { return; } const charDataLen = this.buf32[CHAR1_SLOT] - this.buf32[CHAR0_SLOT]; - if ( bufLen !== this.buf.length ) { + if ( bufLen !== this.buf8.length ) { const newBuf = new Uint8Array(bufLen); - newBuf.set( - new Uint8Array( - this.buf.buffer, - 0, - this.buf32[TRIE1_SLOT] - ), - 0 - ); - newBuf.set( - new Uint8Array( - this.buf.buffer, - this.buf32[CHAR0_SLOT], - charDataLen - ), - char0 - ); - this.buf = newBuf; - this.buf32 = new Uint32Array(this.buf.buffer); + newBuf.set(this.buf8.subarray(0, this.buf32[TRIE1_SLOT]), 0); + newBuf.set(this.buf8.subarray(this.buf32[CHAR0_SLOT], this.buf32[CHAR1_SLOT]), char0); + this.buf8 = newBuf; + this.buf32 = new Uint32Array(this.buf8.buffer); this.buf32[CHAR0_SLOT] = char0; this.buf32[CHAR1_SLOT] = char0 + charDataLen; + this.haystack = this.buf8.subarray( + HAYSTACK_START, + HAYSTACK_START + HAYSTACK_SIZE + ); } if ( char0 !== this.buf32[CHAR0_SLOT] ) { - this.buf.set( - new Uint8Array( - this.buf.buffer, - this.buf32[CHAR0_SLOT], - charDataLen - ), - char0 - ); + this.buf8.copyWithin(char0, this.buf32[CHAR0_SLOT], this.buf32[CHAR1_SLOT]); this.buf32[CHAR0_SLOT] = char0; this.buf32[CHAR1_SLOT] = char0 + charDataLen; } @@ -605,16 +757,24 @@ const SEGMENT_INFO = 2; this.size = size; } - add(s, i = 0) { - if ( this.container.add(this.iroot, s, i) === 1 ) { + add(i, n, pivot = 0) { + 
const iboundary = this.container.add(this.iroot, i, n, pivot); + if ( iboundary !== 0 ) { this.size += 1; - return true; } - return false; + return iboundary; } - matches(a, i) { - return this.container.matches(this.iroot, a, i); + getExtra(iboundary) { + return this.container.buf32[iboundary+BCELL_EXTRA]; + } + + setExtra(iboundary, v) { + this.container.buf32[iboundary+BCELL_EXTRA] = v; + } + + matches(i) { + return this.container.matches(this.iroot, i); } dump() { @@ -623,6 +783,10 @@ const SEGMENT_INFO = 2; } } + get $l() { return this.container.$l; } + get $r() { return this.container.$r; } + get $iu() { return this.container.$iu; } + [Symbol.iterator]() { return { value: undefined, @@ -646,7 +810,7 @@ const SEGMENT_INFO = 2; let i0 = this.container.buf32[CHAR0_SLOT] + (v & 0x00FFFFFF); const i1 = i0 + (v >>> 24); while ( i0 < i1 ) { - this.charBuf[this.charPtr] = this.container.buf[i0]; + this.charBuf[this.charPtr] = this.container.buf8[i0]; this.charPtr += 1; i0 += 1; } diff --git a/src/js/traffic.js b/src/js/traffic.js index bfadd3f69..a69f1796a 100644 --- a/src/js/traffic.js +++ b/src/js/traffic.js @@ -221,7 +221,6 @@ const onBeforeRootFrameRequest = function(fctxt) { url: requestURL, hn: requestHostname, dn: fctxt.getDomain() || requestHostname, - fc: logData.compiled, fs: logData.raw })); @@ -848,7 +847,7 @@ const injectCSP = function(fctxt, pageStore, responseHeaders) { µb.staticNetFilteringEngine.matchAndFetchData(fctxt, 'csp'); for ( const directive of staticDirectives ) { if ( directive.result !== 1 ) { continue; } - cspSubsets.push(directive.data); + cspSubsets.push(directive.getData('csp')); } // URL filtering `allow` rules override static filtering. 
diff --git a/src/js/utils.js b/src/js/utils.js index 78cf18aa2..8e2a0f84d 100644 --- a/src/js/utils.js +++ b/src/js/utils.js @@ -48,7 +48,6 @@ for ( let i = 0, n = this._chars.length; i < n; i++ ) { this._validTokenChars[this._chars.charCodeAt(i)] = i + 1; } - // Four upper bits of token hash are reserved for built-in predefined // token hashes, which should never end up being used when tokenizing // any arbitrary string. @@ -62,10 +61,14 @@ this._urlIn = ''; this._urlOut = ''; this._tokenized = false; - this._tokens = [ 0 ]; + this._tokens = new Uint32Array(1024); this.knownTokens = new Uint8Array(65536); this.resetKnownTokens(); + this.MAX_TOKEN_LENGTH = 7; + + this.charCodes = new Uint8Array(2048); + this.charCodeCount = 0; } setURL(url) { @@ -91,17 +94,24 @@ } // Tokenize on demand. - getTokens() { + getTokens(encodeInto) { if ( this._tokenized ) { return this._tokens; } - let i = this._tokenize(); - i = this._appendTokenAt(i, this.anyTokenHash, 0); + let i = this._tokenize(encodeInto); + this._tokens[i+0] = this.anyTokenHash; + this._tokens[i+1] = 0; + i += 2; if ( this._urlOut.startsWith('https://') ) { - i = this._appendTokenAt(i, this.anyHTTPSTokenHash, 0); + this._tokens[i+0] = this.anyHTTPSTokenHash; + this._tokens[i+1] = 0; + i += 2; } else if ( this._urlOut.startsWith('http://') ) { - i = this._appendTokenAt(i, this.anyHTTPTokenHash, 0); + this._tokens[i+0] = this.anyHTTPTokenHash; + this._tokens[i+1] = 0; + i += 2; } - i = this._appendTokenAt(i, this.noTokenHash, 0); - this._tokens[i] = 0; + this._tokens[i+0] = this.noTokenHash; + this._tokens[i+1] = 0; + this._tokens[i+2] = 0; this._tokenized = true; return this._tokens; } @@ -136,13 +146,7 @@ // https://github.com/chrisaljoudi/uBlock/issues/1118 // We limit to a maximum number of tokens. 
- _appendTokenAt(i, th, ti) { - this._tokens[i+0] = th; - this._tokens[i+1] = ti; - return i + 2; - } - - _tokenize() { + _tokenize(encodeInto) { const tokens = this._tokens; let url = this._urlOut; let l = url.length; @@ -151,19 +155,27 @@ url = url.slice(0, 2048); l = 2048; } + encodeInto.haystackSize = l; const knownTokens = this.knownTokens; const vtc = this._validTokenChars; - let i = 0, j = 0, v, n, ti, th; + const charCodes = encodeInto.haystack; + let i = 0, j = 0, c, v, n, ti, th; for (;;) { for (;;) { if ( i === l ) { return j; } - v = vtc[url.charCodeAt(i++)]; + c = url.charCodeAt(i); + charCodes[i] = c; + v = vtc[c]; + i += 1; if ( v !== 0 ) { break; } } th = v; ti = i - 1; n = 1; for (;;) { if ( i === l ) { break; } - v = vtc[url.charCodeAt(i++)]; + c = url.charCodeAt(i); + charCodes[i] = c; + v = vtc[c]; + i += 1; if ( v === 0 ) { break; } if ( n === 7 ) { continue; } th = th << 4 ^ v; @@ -292,7 +304,12 @@ this.properties = new Map(); } push(args) { - this.block[this.block.length] = this.stringifier(args); + this.block.push(this.stringifier(args)); + } + last() { + if ( Array.isArray(this.block) && this.block.length !== 0 ) { + return this.block[this.block.length - 1]; + } } select(blockId) { if ( blockId === this.blockId ) { return; }