From 85c68116bd5a3091c7e184f05a7ba3ed9ab373dc Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Wed, 4 Aug 2021 15:14:48 -0400 Subject: [PATCH] Group all compiling-related code into FilterCompiler() class In the static network filtering engine (snfe), the compiling-related code was spread across two classes. This commit makes it so that all the compiling-related code is in FilterCompiler class, which clear purpose is to compile raw filters into a form which can be persisted and later fed to the snfe with no parsing overhead. To compile raw static network filter, the new approach is: snfe.createCompiler(parser); Then for each single raw filter to compile: compiler.compile(parser, writer); The caller is responsible to keep a reference to the compiler instance for as long as it is needed. This removes the need for the clunky code used to keep an instance of compiler alive in the snfe. Additionally, snfe.tokenHistograms() has been moved to benchmarks.js, as it has no dependency on the snfe, it's just a utility function. --- platform/browser/main.js | 9 +- platform/nodejs/index.js | 7 +- src/js/benchmarks.js | 41 ++ src/js/reverselookup.js | 5 +- src/js/static-net-filtering.js | 745 +++++++++++++++------------------ src/js/storage.js | 7 +- src/js/tab.js | 11 +- 7 files changed, 395 insertions(+), 430 deletions(-) diff --git a/platform/browser/main.js b/platform/browser/main.js index 5beee3c46..082fd11a4 100644 --- a/platform/browser/main.js +++ b/platform/browser/main.js @@ -27,10 +27,10 @@ import './lib/publicsuffixlist/publicsuffixlist.js'; import './lib/punycode.js'; import globals from './js/globals.js'; +import staticNetFilteringEngine from './js/static-net-filtering.js'; import { FilteringContext } from './js/filtering-context.js'; import { LineIterator } from './js/text-utils.js'; import { StaticFilteringParser } from './js/static-filtering-parser.js'; -import { staticNetFilteringEngine } from './js/static-net-filtering.js'; import { CompiledListReader, @@ -42,6 +42,7 @@ import { function compileList(rawText, writer) { const lineIter = new LineIterator(rawText); const parser = new StaticFilteringParser(true); + const compiler = staticNetFilteringEngine.createCompiler(parser); parser.setMaxTokenLength(staticNetFilteringEngine.MAX_TOKEN_LENGTH); @@ -59,12 +60,12 @@ function compileList(rawText, writer) { if ( parser.patternHasUnicode() && parser.toASCII() === false ) { continue; } - if ( staticNetFilteringEngine.compile(parser, writer) ) { continue; } - if ( staticNetFilteringEngine.error !== undefined ) { + if ( compiler.compile(parser, writer) ) { continue; } + if ( compiler.error !== undefined ) { console.info(JSON.stringify({ realm: 'message', type: 'error', - text: staticNetFilteringEngine.error + text: compiler.error })); } } diff --git a/platform/nodejs/index.js b/platform/nodejs/index.js index 877cc1193..c70363630 100644 --- a/platform/nodejs/index.js +++ b/platform/nodejs/index.js @@ -54,6 +54,7 @@ function loadJSON(path) { function compileList(rawText, writer, options = {}) { const lineIter = new LineIterator(rawText); const parser = new StaticFilteringParser(true); + const compiler = snfe.createCompiler(parser); const events = Array.isArray(options.events) ? options.events : undefined; parser.setMaxTokenLength(snfe.MAX_TOKEN_LENGTH); @@ -70,11 +71,11 @@ function compileList(rawText, writer, options = {}) { if ( parser.patternHasUnicode() && parser.toASCII() === false ) { continue; } - if ( snfe.compile(parser, writer) ) { continue; } - if ( snfe.error !== undefined && events !== undefined ) { + if ( compiler.compile(parser, writer) ) { continue; } + if ( compiler.error !== undefined && events !== undefined ) { options.events.push({ type: 'error', - text: snfe.error + text: compiler.error }); } } diff --git a/src/js/benchmarks.js b/src/js/benchmarks.js index 7f9405f4f..1c5c4c6d8 100644 --- a/src/js/benchmarks.js +++ b/src/js/benchmarks.js @@ -199,6 +199,47 @@ const loadBenchmarkDataset = (( ) => { /******************************************************************************/ +µb.tokenHistograms = async function() { + const requests = await loadBenchmarkDataset(); + if ( Array.isArray(requests) === false || requests.length === 0 ) { + console.info('No requests found to benchmark'); + return; + } + + console.info(`Computing token histograms...`); + + const fctxt = new FilteringContext(); + const missTokenMap = new Map(); + const hitTokenMap = new Map(); + const reTokens = /[0-9a-z%]{2,}/g; + + for ( let i = 0; i < requests.length; i++ ) { + const request = requests[i]; + fctxt.setURL(request.url); + fctxt.setDocOriginFromURL(request.frameUrl); + fctxt.setType(request.cpt); + const r = staticNetFilteringEngine.matchRequest(fctxt); + for ( let [ keyword ] of request.url.toLowerCase().matchAll(reTokens) ) { + const token = keyword.slice(0, 7); + if ( r === 0 ) { + missTokenMap.set(token, (missTokenMap.get(token) || 0) + 1); + } else if ( r === 1 ) { + hitTokenMap.set(token, (hitTokenMap.get(token) || 0) + 1); + } + } + } + const customSort = (a, b) => b[1] - a[1]; + const topmisses = Array.from(missTokenMap).sort(customSort).slice(0, 100); + for ( const [ token ] of topmisses ) { + hitTokenMap.delete(token); + } + const tophits = Array.from(hitTokenMap).sort(customSort).slice(0, 100); + console.info('Misses:', JSON.stringify(topmisses)); + console.info('Hits:', JSON.stringify(tophits)); +}; + +/******************************************************************************/ + µb.benchmarkDynamicNetFiltering = async function() { const requests = await loadBenchmarkDataset(); if ( Array.isArray(requests) === false || requests.length === 0 ) { diff --git a/src/js/reverselookup.js b/src/js/reverselookup.js index aa3bf01ae..28e40c9ec 100644 --- a/src/js/reverselookup.js +++ b/src/js/reverselookup.js @@ -137,9 +137,8 @@ const fromNetFilter = async function(rawFilter) { parser.setMaxTokenLength(staticNetFilteringEngine.MAX_TOKEN_LENGTH); parser.analyze(rawFilter); - if ( staticNetFilteringEngine.compile(parser, writer) === false ) { - return; - } + const compiler = staticNetFilteringEngine.createCompiler(parser); + if ( compiler.compile(parser, writer) === false ) { return; } await initWorker(); diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 419f300bf..4f6c147cd 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -171,6 +171,16 @@ const MAX_TOKEN_LENGTH = 7; const COMPILED_BAD_SECTION = 1; +// Four upper bits of token hash are reserved for built-in predefined +// token hashes, which should never end up being used when tokenizing +// any arbitrary string. +const NO_TOKEN_HASH = 0x50000000; +const DOT_TOKEN_HASH = 0x10000000; +const ANY_TOKEN_HASH = 0x20000000; +const ANY_HTTPS_TOKEN_HASH = 0x30000000; +const ANY_HTTP_TOKEN_HASH = 0x40000000; +const EMPTY_TOKEN_HASH = 0xF0000000; + /******************************************************************************/ // See the following as short-lived registers, used during evaluation. They are @@ -261,56 +271,74 @@ restrFromGenericPattern.reAsterisks = /\*+/g; restrFromGenericPattern.restrHostnameAnchor1 = '^[a-z-]+://(?:[^/?#]+\\.)?'; restrFromGenericPattern.restrHostnameAnchor2 = '^[a-z-]+://(?:[^/?#]+)?'; -const toLogDataInternal = function(categoryBits, tokenHash, iunit) { - if ( iunit === 0 ) { return; } - const pattern = []; - const regex = []; - const options = []; - const denyallow = []; - const domains = []; - const logData = { - pattern, - regex, - denyallow, - domains, - options, - isRegex: false, - }; - filterUnits[iunit].logData(logData); - if ( (categoryBits & Important) !== 0 ) { - logData.options.unshift('important'); +/******************************************************************************/ + +class LogData { + constructor(categoryBits, tokenHash, iunit) { + this.result = 0; + this.source = 'static'; + this.tokenHash = tokenHash; + if ( iunit === 0 ) { + this.raw = this.regex = ''; + return; + } + this.result = (categoryBits & AllowAction) === 0 ? 1 : 2; + const pattern = []; + const regex = []; + const options = []; + const denyallow = []; + const domains = []; + const logData = { + pattern, + regex, + denyallow, + domains, + options, + isRegex: false, + }; + filterUnits[iunit].logData(logData); + if ( (categoryBits & Important) !== 0 ) { + logData.options.unshift('important'); + } + if ( (categoryBits & ThirdParty) !== 0 ) { + logData.options.unshift('3p'); + } else if ( (categoryBits & FirstParty) !== 0 ) { + logData.options.unshift('1p'); + } + const type = categoryBits & TypeBitsMask; + if ( type !== 0 ) { + logData.options.unshift(typeValueToTypeName[type >>> TypeBitsOffset]); + } + let raw = logData.pattern.join(''); + if ( + logData.isRegex === false && + raw.charCodeAt(0) === 0x2F /* '/' */ && + raw.charCodeAt(raw.length - 1) === 0x2F /* '/' */ + ) { + raw += '*'; + } + if ( (categoryBits & AllowAction) !== 0 ) { + raw = '@@' + raw; + } + if ( denyallow.length !== 0 ) { + options.push(`denyallow=${denyallow.join('|')}`); + } + if ( domains.length !== 0 ) { + options.push(`domain=${domains.join('|')}`); + } + if ( options.length !== 0 ) { + raw += '$' + options.join(','); + } + this.raw = raw; + this.regex = logData.regex.join(''); } - if ( (categoryBits & ThirdParty) !== 0 ) { - logData.options.unshift('3p'); - } else if ( (categoryBits & FirstParty) !== 0 ) { - logData.options.unshift('1p'); + isUntokenized() { + return this.tokenHash === NO_TOKEN_HASH; } - const type = categoryBits & TypeBitsMask; - if ( type !== 0 ) { - logData.options.unshift(typeValueToTypeName[type >>> TypeBitsOffset]); + isPureHostname() { + return this.tokenHash === DOT_TOKEN_HASH; } - let raw = logData.pattern.join(''); - if ( - logData.isRegex === false && - raw.charCodeAt(0) === 0x2F /* '/' */ && - raw.charCodeAt(raw.length - 1) === 0x2F /* '/' */ - ) { - raw += '*'; - } - if ( (categoryBits & AllowAction) !== 0 ) { - raw = '@@' + raw; - } - if ( denyallow.length !== 0 ) { - options.push(`denyallow=${denyallow.join('|')}`); - } - if ( domains.length !== 0 ) { - options.push(`domain=${domains.join('|')}`); - } - if ( options.length !== 0 ) { - raw += '$' + options.join(','); - } - return { raw, regex: logData.regex.join('') }; -}; +} /******************************************************************************/ @@ -462,7 +490,7 @@ const filterPattern = { units.push(FilterTrue.compile()); return; } - if ( parsed.tokenHash === parsed.noTokenHash ) { + if ( parsed.tokenHash === NO_TOKEN_HASH ) { units.push(FilterPatternGeneric.compile(parsed)); return; } @@ -1723,8 +1751,7 @@ const FilterModifierResult = class { } logData() { - const r = toLogDataInternal(this.bits, this.th, this.iunit); - r.source = 'static'; + const r = new LogData(this.bits, this.th, this.iunit); r.result = this.result; r.modifier = true; return r; @@ -2492,15 +2519,6 @@ const urlTokenizer = new (class { for ( let i = 0, n = this._chars.length; i < n; i++ ) { this._validTokenChars[this._chars.charCodeAt(i)] = i + 1; } - // Four upper bits of token hash are reserved for built-in predefined - // token hashes, which should never end up being used when tokenizing - // any arbitrary string. - this.dotTokenHash = 0x10000000; - this.anyTokenHash = 0x20000000; - this.anyHTTPSTokenHash = 0x30000000; - this.anyHTTPTokenHash = 0x40000000; - this.noTokenHash = 0x50000000; - this.emptyTokenHash = 0xF0000000; this._urlIn = ''; this._urlOut = ''; @@ -2512,7 +2530,6 @@ const urlTokenizer = new (class { this.knownTokens = new Uint8Array(65536); this.resetKnownTokens(); - this.MAX_TOKEN_LENGTH = MAX_TOKEN_LENGTH; } setURL(url) { @@ -2527,11 +2544,11 @@ const urlTokenizer = new (class { resetKnownTokens() { this.knownTokens.fill(0); - this.addKnownToken(this.dotTokenHash); - this.addKnownToken(this.anyTokenHash); - this.addKnownToken(this.anyHTTPSTokenHash); - this.addKnownToken(this.anyHTTPTokenHash); - this.addKnownToken(this.noTokenHash); + this.addKnownToken(DOT_TOKEN_HASH); + this.addKnownToken(ANY_TOKEN_HASH); + this.addKnownToken(ANY_HTTPS_TOKEN_HASH); + this.addKnownToken(ANY_HTTP_TOKEN_HASH); + this.addKnownToken(NO_TOKEN_HASH); } addKnownToken(th) { @@ -2542,19 +2559,19 @@ const urlTokenizer = new (class { getTokens(encodeInto) { if ( this._tokenized ) { return this._tokens; } let i = this._tokenize(encodeInto); - this._tokens[i+0] = this.anyTokenHash; + this._tokens[i+0] = ANY_TOKEN_HASH; this._tokens[i+1] = 0; i += 2; if ( this._urlOut.startsWith('https://') ) { - this._tokens[i+0] = this.anyHTTPSTokenHash; + this._tokens[i+0] = ANY_HTTPS_TOKEN_HASH; this._tokens[i+1] = 0; i += 2; } else if ( this._urlOut.startsWith('http://') ) { - this._tokens[i+0] = this.anyHTTPTokenHash; + this._tokens[i+0] = ANY_HTTP_TOKEN_HASH; this._tokens[i+1] = 0; i += 2; } - this._tokens[i+0] = this.noTokenHash; + this._tokens[i+0] = NO_TOKEN_HASH; this._tokens[i+1] = 0; this._tokens[i+2] = 0; this._tokenized = true; @@ -2571,7 +2588,7 @@ const urlTokenizer = new (class { tokenHashFromString(s) { const l = s.length; - if ( l === 0 ) { return this.emptyTokenHash; } + if ( l === 0 ) { return EMPTY_TOKEN_HASH; } const vtc = this._validTokenChars; let th = vtc[s.charCodeAt(0)]; for ( let i = 1; i !== 7 /* MAX_TOKEN_LENGTH */ && i !== l; i++ ) { @@ -2656,12 +2673,11 @@ const urlTokenizer = new (class { /******************************************************************************/ /******************************************************************************/ -const FilterParser = class { +class FilterCompiler { constructor(parser, other = undefined) { if ( other !== undefined ) { return Object.assign(this, other); } - this.noTokenHash = urlTokenizer.noTokenHash; this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/; this.reToken = /[%0-9A-Za-z]+/g; this.domainOptList = []; @@ -2794,7 +2810,6 @@ const FilterParser = class { [ 'crop',1431 ], [ 'new',1412], ]); - this.maxTokenLen = urlTokenizer.MAX_TOKEN_LENGTH; this.reset(parser); } @@ -2810,9 +2825,9 @@ const FilterParser = class { // 0101 (0x5): anchored to the hostname and end of the URL. this.anchor = 0; this.badFilter = false; + this.error = undefined; this.modifyType = undefined; this.modifyValue = undefined; - this.invalid = false; this.pattern = ''; this.patternMatchCase = false; this.party = AnyParty; @@ -2824,7 +2839,7 @@ const FilterParser = class { this.isRegex = false; this.strictParty = 0; this.token = '*'; - this.tokenHash = this.noTokenHash; + this.tokenHash = NO_TOKEN_HASH; this.tokenBeg = 0; this.typeBits = 0; this.notTypeBits = 0; @@ -2832,12 +2847,11 @@ const FilterParser = class { this.secondWildcardPos = -1; this.firstCaretPos = -1; this.secondCaretPos = -1; - this.unsupported = false; return this; } clone() { - return new FilterParser(this.parser, this); + return new FilterCompiler(this.parser, this); } normalizeRegexSource(s) { @@ -2852,7 +2866,7 @@ const FilterParser = class { // https://github.com/chrisaljoudi/uBlock/issues/589 // Be ready to handle multiple negated types - parseTypeOption(id, not) { + processTypeOption(id, not) { const typeBit = id !== -1 ? this.tokenIdToNormalizedType.get(id) : allTypesBits; @@ -2863,14 +2877,14 @@ const FilterParser = class { } } - parsePartyOption(firstParty, not) { + processPartyOption(firstParty, not) { if ( not ) { firstParty = !firstParty; } this.party |= firstParty ? FirstParty : ThirdParty; } - parseHostnameList(s, modeBits, out = []) { + processHostnameList(s, modeBits, out = []) { let beg = 0; let slen = s.length; let i = 0; @@ -2890,7 +2904,7 @@ const FilterParser = class { return i === 1 ? out[0] : out.join('|'); } - parseModifierOption(modifier, value) { + processModifierOption(modifier, value) { if ( this.modifyType !== undefined ) { return false; } this.modifyType = modifier; if ( value !== undefined ) { @@ -2901,32 +2915,32 @@ const FilterParser = class { return true; } - parseOptions() { + processOptions() { for ( let { id, val, not } of this.parser.netOptions() ) { switch ( id ) { case this.parser.OPTToken1p: - this.parsePartyOption(true, not); + this.processPartyOption(true, not); break; case this.parser.OPTToken1pStrict: this.strictParty = this.strictParty === -1 ? 0 : 1; this.optionUnitBits |= this.STRICT_PARTY_BIT; break; case this.parser.OPTToken3p: - this.parsePartyOption(false, not); + this.processPartyOption(false, not); break; case this.parser.OPTToken3pStrict: this.strictParty = this.strictParty === 1 ? 0 : -1; this.optionUnitBits |= this.STRICT_PARTY_BIT; break; case this.parser.OPTTokenAll: - this.parseTypeOption(-1); + this.processTypeOption(-1); break; // https://github.com/uBlockOrigin/uAssets/issues/192 case this.parser.OPTTokenBadfilter: this.badFilter = true; break; case this.parser.OPTTokenCsp: - if ( this.parseModifierOption(id, val) === false ) { + if ( this.processModifierOption(id, val) === false ) { return false; } if ( val !== undefined && this.reBadCSP.test(val) ) { @@ -2938,7 +2952,7 @@ const FilterParser = class { // Detect and discard filter if domain option contains // nonsensical characters. case this.parser.OPTTokenDomain: - this.domainOpt = this.parseHostnameList( + this.domainOpt = this.processHostnameList( val, 0b1010, this.domainOptList @@ -2947,15 +2961,15 @@ const FilterParser = class { this.optionUnitBits |= this.DOMAIN_BIT; break; case this.parser.OPTTokenDenyAllow: - this.denyallowOpt = this.parseHostnameList(val, 0b0000); + this.denyallowOpt = this.processHostnameList(val, 0b0000); if ( this.denyallowOpt === '' ) { return false; } this.optionUnitBits |= this.DENYALLOW_BIT; break; // https://www.reddit.com/r/uBlockOrigin/comments/d6vxzj/ // Add support for `elemhide`. Rarely used but it happens. case this.parser.OPTTokenEhide: - this.parseTypeOption(this.parser.OPTTokenShide, not); - this.parseTypeOption(this.parser.OPTTokenGhide, not); + this.processTypeOption(this.parser.OPTTokenShide, not); + this.processTypeOption(this.parser.OPTTokenGhide, not); break; case this.parser.OPTTokenHeader: this.headerOpt = val !== undefined ? val : ''; @@ -2971,7 +2985,7 @@ const FilterParser = class { id = this.action === AllowAction ? this.parser.OPTTokenRedirectRule : this.parser.OPTTokenRedirect; - if ( this.parseModifierOption(id, 'empty') === false ) { + if ( this.processModifierOption(id, 'empty') === false ) { return false; } this.optionUnitBits |= this.REDIRECT_BIT; @@ -2983,7 +2997,7 @@ const FilterParser = class { id = this.action === AllowAction ? this.parser.OPTTokenRedirectRule : this.parser.OPTTokenRedirect; - if ( this.parseModifierOption(id, 'noopmp4-1s') === false ) { + if ( this.processModifierOption(id, 'noopmp4-1s') === false ) { return false; } this.optionUnitBits |= this.REDIRECT_BIT; @@ -2991,7 +3005,7 @@ const FilterParser = class { case this.parser.OPTTokenNoop: break; case this.parser.OPTTokenQueryprune: - if ( this.parseModifierOption(id, val) === false ) { + if ( this.processModifierOption(id, val) === false ) { return false; } this.optionUnitBits |= this.QUERYPRUNE_BIT; @@ -3000,13 +3014,13 @@ const FilterParser = class { if ( this.action === AllowAction ) { id = this.parser.OPTTokenRedirectRule; } - if ( this.parseModifierOption(id, val) === false ) { + if ( this.processModifierOption(id, val) === false ) { return false; } this.optionUnitBits |= this.REDIRECT_BIT; break; case this.parser.OPTTokenRedirectRule: - if ( this.parseModifierOption(id, val) === false ) { + if ( this.processModifierOption(id, val) === false ) { return false; } this.optionUnitBits |= this.REDIRECT_BIT; @@ -3017,7 +3031,7 @@ const FilterParser = class { if ( this.tokenIdToNormalizedType.has(id) === false ) { return false; } - this.parseTypeOption(id, not); + this.processTypeOption(id, not); break; } } @@ -3025,6 +3039,7 @@ const FilterParser = class { if ( this.party === AllParties ) { this.party = AnyParty; } + // Negated network types? Toggle on all network type bits. // Negated non-network types can only toggle themselves. // @@ -3043,13 +3058,15 @@ const FilterParser = class { this.typeBits &= ~this.notTypeBits; if ( this.typeBits === 0 ) { return false; } } + // CSP directives implicitly apply only to document/subdocument. if ( this.modifyType === this.parser.OPTTokenCsp ) { if ( this.typeBits === 0 ) { - this.parseTypeOption(this.parser.OPTTokenDoc, false); - this.parseTypeOption(this.parser.OPTTokenFrame, false); + this.processTypeOption(this.parser.OPTTokenDoc, false); + this.processTypeOption(this.parser.OPTTokenFrame, false); } } + // https://github.com/gorhill/uBlock/issues/2283 // Abort if type is only for unsupported types, otherwise // toggle off `unsupported` bit. @@ -3057,23 +3074,22 @@ const FilterParser = class { this.typeBits &= ~unsupportedTypeBit; if ( this.typeBits === 0 ) { return false; } } + return true; } - parse(parser) { + process(parser) { // important! this.reset(parser); if ( parser.hasError() ) { - this.invalid = true; - return this; + return this.FILTER_INVALID; } // Filters which pattern is a single character other than `*` and have // no narrowing options are discarded as invalid. if ( parser.patternIsDubious() ) { - this.invalid = true; - return this; + return this.FILTER_INVALID; } // block or allow filter? @@ -3088,13 +3104,12 @@ const FilterParser = class { if ( this.isPureHostname && parser.hasOptions() === false ) { this.pattern = parser.patternToLowercase(); this.anchor |= 0b100; - return this; + return this.FILTER_OK; } // options - if ( parser.hasOptions() && this.parseOptions() === false ) { - this.unsupported = true; - return this; + if ( parser.hasOptions() && this.processOptions() === false ) { + return this.FILTER_UNSUPPORTED; } // regex? @@ -3105,17 +3120,14 @@ const FilterParser = class { // source string -- this ensure reverse-lookup will work fine. this.pattern = this.normalizeRegexSource(parser.getNetPattern()); if ( this.pattern === '' ) { - this.unsupported = true; + return this.FILTER_UNSUPPORTED; } - return this; + return this.FILTER_OK; } - let pattern; - if ( parser.patternIsMatchAll() ) { - pattern = '*'; - } else { - pattern = parser.patternToLowercase(); - } + const pattern = parser.patternIsMatchAll() + ? '*' + : parser.patternToLowercase(); if ( parser.patternIsLeftHostnameAnchored() ) { this.anchor |= 0b100; @@ -3143,13 +3155,11 @@ const FilterParser = class { } if ( pattern.length > 1024 ) { - this.unsupported = true; - return this; + return this.FILTER_UNSUPPORTED; } this.pattern = pattern; - - return this; + return this.FILTER_OK; } // Given a string, find a good token. Tokens which are too generic, @@ -3188,7 +3198,7 @@ const FilterParser = class { const c = pattern.charCodeAt(match.index - 1); if ( c === 0x2A /* '*' */ ) { continue; } } - if ( token.length < this.maxTokenLen ) { + if ( token.length < MAX_TOKEN_LENGTH ) { const lastIndex = this.reToken.lastIndex; if ( lastIndex < pattern.length ) { const c = pattern.charCodeAt(lastIndex); @@ -3226,7 +3236,7 @@ const FilterParser = class { } const { lastIndex } = this.reToken; if ( - token.length < this.maxTokenLen && ( + token.length < MAX_TOKEN_LENGTH && ( lastIndex === pattern.length || pattern.charAt(lastIndex) === '\x01' ) @@ -3283,57 +3293,202 @@ const FilterParser = class { s.charCodeAt(l-1) === 0x2A /* '*' */ && s.charCodeAt(l-2) === 0x2E /* '.' */; } -}; -FilterParser.prototype.DOMAIN_BIT = 0b00000001; -FilterParser.prototype.DENYALLOW_BIT = 0b00000010; -FilterParser.prototype.HEADER_BIT = 0b00000100; -FilterParser.prototype.STRICT_PARTY_BIT = 0b00001000; + compile(parser, writer) { + const r = this.process(parser); -FilterParser.prototype.CSP_BIT = 0b00010000; -FilterParser.prototype.QUERYPRUNE_BIT = 0b00100000; -FilterParser.prototype.REDIRECT_BIT = 0b01000000; + // Ignore non-static network filters + if ( r === this.FILTER_INVALID ) { return false; } -/******************************************************************************/ + // Ignore filters with unsupported options + if ( r === this.FILTER_UNSUPPORTED ) { + const who = writer.properties.get('name') || '?'; + this.error = `Invalid network filter in ${who}: ${parser.raw}`; + return false; + } -FilterParser.parse = (( ) => { - let parser; - let last = 0; - let ttlTimer; + writer.select( + this.badFilter + ? writer.NETWORK_SECTION + COMPILED_BAD_SECTION + : writer.NETWORK_SECTION + ); - const ttlProcess = ( ) => { - ttlTimer = undefined; - if ( (Date.now() - last) > 10000 ) { - parser = undefined; + // Reminder: + // `redirect=` is a combination of a `redirect-rule` filter and a + // block filter. + if ( this.modifyType === parser.OPTTokenRedirect ) { + this.modifyType = parser.OPTTokenRedirectRule; + const parsedBlock = this.clone(); + parsedBlock.modifyType = undefined; + parsedBlock.optionUnitBits &= ~this.REDIRECT_BIT; + parsedBlock.compileToFilter(writer); + } + + this.compileToFilter(writer); + + return true; + } + + compileToFilter(writer) { + // Pure hostnames, use more efficient dictionary lookup + if ( this.isPureHostname && this.hasNoOptionUnits() ) { + this.tokenHash = DOT_TOKEN_HASH; + this.compileToAtomicFilter(this.pattern, writer); return; } - ttlTimer = globals.setTimeout(ttlProcess, 10007); - }; - return p => { - if ( parser === undefined ) { - parser = new FilterParser(p); + this.makeToken(); + + // Special pattern/option cases: + // - `*$domain=...` + // - `|http://$domain=...` + // - `|https://$domain=...` + // The semantic of "just-origin" filters is that contrary to normal + // filters, the original filter is split into as many filters as there + // are entries in the `domain=` option. + if ( this.isJustOrigin() ) { + const tokenHash = this.tokenHash; + if ( this.pattern === '*' || this.pattern.startsWith('http*') ) { + this.tokenHash = ANY_TOKEN_HASH; + } else if /* 'https:' */ ( this.pattern.startsWith('https') ) { + this.tokenHash = ANY_HTTPS_TOKEN_HASH; + } else /* 'http:' */ { + this.tokenHash = ANY_HTTP_TOKEN_HASH; + } + const entities = []; + for ( const hn of this.domainOptList ) { + if ( this.domainIsEntity(hn) === false ) { + this.compileToAtomicFilter(hn, writer); + } else { + entities.push(hn); + } + } + if ( entities.length === 0 ) { return; } + this.tokenHash = tokenHash; + const leftAnchored = (this.anchor & 0b010) !== 0; + for ( const entity of entities ) { + const units = []; + filterPattern.compile(this, units); + if ( leftAnchored ) { units.push(FilterAnchorLeft.compile()); } + filterOrigin.compile([ entity ], true, units); + this.compileToAtomicFilter( + FilterCompositeAll.compile(units), + writer + ); + } + return; } - last = Date.now(); - if ( ttlTimer === undefined ) { - ttlTimer = globals.setTimeout(ttlProcess, 10007); + + const units = []; + + // Pattern + filterPattern.compile(this, units); + + // Anchor + if ( (this.anchor & 0b100) !== 0 ) { + if ( this.isPureHostname ) { + units.push(FilterAnchorHn.compile()); + } else { + units.push(FilterAnchorHnLeft.compile()); + } + } else if ( (this.anchor & 0b010) !== 0 ) { + units.push(FilterAnchorLeft.compile()); } - return parser.parse(p); - }; -})(); + if ( (this.anchor & 0b001) !== 0 ) { + units.push(FilterAnchorRight.compile()); + } + + // Strict partiness + if ( this.strictParty !== 0 ) { + units.push(FilterStrictParty.compile(this)); + } + + // Origin + if ( this.domainOpt !== '' ) { + filterOrigin.compile( + this.domainOptList, + units.length !== 0 && filterClasses[units[0][0]].isSlow === true, + units + ); + } + + // Deny-allow + if ( this.denyallowOpt !== '' ) { + units.push(FilterDenyAllow.compile(this)); + } + + // Header + if ( this.headerOpt !== undefined ) { + units.push(FilterOnHeaders.compile(this)); + this.action |= HEADERS; + } + + // Modifier + // + // IMPORTANT: the modifier unit MUST always appear first in a sequence + if ( this.modifyType !== undefined ) { + units.unshift(FilterModifier.compile(this)); + this.action = (this.action & ~ActionBitsMask) | ModifyAction; + } + + const fdata = units.length === 1 + ? units[0] + : FilterCompositeAll.compile(units); + this.compileToAtomicFilter(fdata, writer); + } + + compileToAtomicFilter(fdata, writer) { + const catBits = this.action | this.party; + let { typeBits } = this; + + // Typeless + if ( typeBits === 0 ) { + writer.push([ catBits, this.tokenHash, fdata ]); + return; + } + // If all network types are set, create a typeless filter + if ( + (typeBits & allNetworkTypesBits) === allNetworkTypesBits && + (this.notTypeBits & documentTypeBit) === 0 + ) { + writer.push([ catBits, this.tokenHash, fdata ]); + typeBits &= ~allNetworkTypesBits; + } + + // One filter per specific types + let bitOffset = 1; + do { + if ( typeBits & 1 ) { + writer.push([ + catBits | (bitOffset << TypeBitsOffset), + this.tokenHash, + fdata + ]); + } + bitOffset += 1; + typeBits >>>= 1; + } while ( typeBits !== 0 ); + } +} + +FilterCompiler.prototype.DOMAIN_BIT = 0b00000001; +FilterCompiler.prototype.DENYALLOW_BIT = 0b00000010; +FilterCompiler.prototype.HEADER_BIT = 0b00000100; +FilterCompiler.prototype.STRICT_PARTY_BIT = 0b00001000; +FilterCompiler.prototype.CSP_BIT = 0b00010000; +FilterCompiler.prototype.QUERYPRUNE_BIT = 0b00100000; +FilterCompiler.prototype.REDIRECT_BIT = 0b01000000; + +FilterCompiler.prototype.FILTER_OK = 0; +FilterCompiler.prototype.FILTER_INVALID = 1; +FilterCompiler.prototype.FILTER_UNSUPPORTED = 2; /******************************************************************************/ /******************************************************************************/ const FilterContainer = function() { - this.MAX_TOKEN_LENGTH = urlTokenizer.MAX_TOKEN_LENGTH; - this.noTokenHash = urlTokenizer.noTokenHash; - this.dotTokenHash = urlTokenizer.dotTokenHash; - this.anyTokenHash = urlTokenizer.anyTokenHash; - this.anyHTTPSTokenHash = urlTokenizer.anyHTTPSTokenHash; - this.anyHTTPTokenHash = urlTokenizer.anyHTTPTokenHash; + this.MAX_TOKEN_LENGTH = MAX_TOKEN_LENGTH; this.optimizeTimerId = undefined; - // As long as CategoryCount is reasonably low, we will use an array to // store buckets using category bits as index. If ever CategoryCount // becomes too large, we can just go back to using a Map. @@ -3416,37 +3571,37 @@ FilterContainer.prototype.freeze = function() { } let iunit = bucket.get(tokenHash); - if ( tokenHash === this.dotTokenHash ) { + if ( tokenHash === DOT_TOKEN_HASH ) { if ( iunit === undefined ) { iunit = filterUnitFromCtor(FilterHostnameDict); - bucket.set(this.dotTokenHash, iunit); + bucket.set(DOT_TOKEN_HASH, iunit); } filterUnits[iunit].add(fdata); continue; } - if ( tokenHash === this.anyTokenHash ) { + if ( tokenHash === ANY_TOKEN_HASH ) { if ( iunit === undefined ) { iunit = filterUnitFromCtor(FilterJustOrigin); - bucket.set(this.anyTokenHash, iunit); + bucket.set(ANY_TOKEN_HASH, iunit); } filterUnits[iunit].add(fdata); continue; } - if ( tokenHash === this.anyHTTPSTokenHash ) { + if ( tokenHash === ANY_HTTPS_TOKEN_HASH ) { if ( iunit === undefined ) { iunit = filterUnitFromCtor(FilterHTTPSJustOrigin); - bucket.set(this.anyHTTPSTokenHash, iunit); + bucket.set(ANY_HTTPS_TOKEN_HASH, iunit); } filterUnits[iunit].add(fdata); continue; } - if ( tokenHash === this.anyHTTPTokenHash ) { + if ( tokenHash === ANY_HTTP_TOKEN_HASH ) { if ( iunit === undefined ) { iunit = filterUnitFromCtor(FilterHTTPJustOrigin); - bucket.set(this.anyHTTPTokenHash, iunit); + bucket.set(ANY_HTTP_TOKEN_HASH, iunit); } filterUnits[iunit].add(fdata); continue; @@ -3502,7 +3657,7 @@ FilterContainer.prototype.optimize = function() { const f = filterUnits[iunit]; if ( f instanceof FilterBucket === false ) { continue; } const optimizeBits = - (th === this.noTokenHash) || (bits & ModifyAction) !== 0 + (th === NO_TOKEN_HASH) || (bits & ModifyAction) !== 0 ? 0b10 : 0b01; const g = f.optimize(optimizeBits); @@ -3514,9 +3669,7 @@ FilterContainer.prototype.optimize = function() { FilterHostnameDict.optimize(); bidiTrieOptimize(); // Be sure unused filters can be garbage collected. - for ( let i = filterUnitWritePtr, n = filterUnits.length; i < n; i++ ) { - filterUnits[i] = null; - } + filterUnits.fill(null, filterUnitWritePtr); }; /******************************************************************************/ @@ -3656,189 +3809,8 @@ FilterContainer.prototype.fromSelfie = function(storage, path) { /******************************************************************************/ -FilterContainer.prototype.compile = function(parser, writer) { - this.error = undefined; - - const parsed = FilterParser.parse(parser); - - // Ignore non-static network filters - if ( parsed.invalid ) { return false; } - - // Ignore filters with unsupported options - if ( parsed.unsupported ) { - const who = writer.properties.get('name') || '?'; - this.error = `Invalid network filter in ${who}: ${parser.raw}`; - return false; - } - - writer.select( - parsed.badFilter - ? writer.NETWORK_SECTION + COMPILED_BAD_SECTION - : writer.NETWORK_SECTION - ); - - // Reminder: - // `redirect=` is a combination of a `redirect-rule` filter and a - // block filter. - if ( parsed.modifyType === parser.OPTTokenRedirect ) { - parsed.modifyType = parser.OPTTokenRedirectRule; - const parsedBlock = parsed.clone(); - parsedBlock.modifyType = undefined; - parsedBlock.optionUnitBits &= ~parsed.REDIRECT_BIT; - this.compileParsed(parsedBlock, writer); - } - - this.compileParsed(parsed, writer); - - return true; -}; - -/******************************************************************************/ - -FilterContainer.prototype.compileParsed = function(parsed, writer) { - // Pure hostnames, use more efficient dictionary lookup - if ( parsed.isPureHostname && parsed.hasNoOptionUnits() ) { - parsed.tokenHash = this.dotTokenHash; - this.compileToAtomicFilter(parsed, parsed.pattern, writer); - return; - } - - parsed.makeToken(); - - // Special pattern/option cases: - // - `*$domain=...` - // - `|http://$domain=...` - // - `|https://$domain=...` - // The semantic of "just-origin" filters is that contrary to normal - // filters, the original filter is split into as many filters as there - // are entries in the `domain=` option. - if ( parsed.isJustOrigin() ) { - const tokenHash = parsed.tokenHash; - if ( parsed.pattern === '*' || parsed.pattern.startsWith('http*') ) { - parsed.tokenHash = this.anyTokenHash; - } else if /* 'https:' */ ( parsed.pattern.startsWith('https') ) { - parsed.tokenHash = this.anyHTTPSTokenHash; - } else /* 'http:' */ { - parsed.tokenHash = this.anyHTTPTokenHash; - } - const entities = []; - for ( const hn of parsed.domainOptList ) { - if ( parsed.domainIsEntity(hn) === false ) { - this.compileToAtomicFilter(parsed, hn, writer); - } else { - entities.push(hn); - } - } - if ( entities.length === 0 ) { return; } - parsed.tokenHash = tokenHash; - const leftAnchored = (parsed.anchor & 0b010) !== 0; - for ( const entity of entities ) { - const units = []; - filterPattern.compile(parsed, units); - if ( leftAnchored ) { units.push(FilterAnchorLeft.compile()); } - filterOrigin.compile([ entity ], true, units); - this.compileToAtomicFilter( - parsed, FilterCompositeAll.compile(units), writer - ); - } - return; - } - - const units = []; - - // Pattern - filterPattern.compile(parsed, units); - - // Anchor - if ( (parsed.anchor & 0b100) !== 0 ) { - if ( parsed.isPureHostname ) { - units.push(FilterAnchorHn.compile()); - } else { - units.push(FilterAnchorHnLeft.compile()); - } - } else if ( (parsed.anchor & 0b010) !== 0 ) { - units.push(FilterAnchorLeft.compile()); - } - if ( (parsed.anchor & 0b001) !== 0 ) { - units.push(FilterAnchorRight.compile()); - } - - // Strict partiness - if ( parsed.strictParty !== 0 ) { - units.push(FilterStrictParty.compile(parsed)); - } - - // Origin - if ( parsed.domainOpt !== '' ) { - filterOrigin.compile( - parsed.domainOptList, - units.length !== 0 && filterClasses[units[0][0]].isSlow === true, - units - ); - } - - // Deny-allow - if ( parsed.denyallowOpt !== '' ) { - units.push(FilterDenyAllow.compile(parsed)); - } - - // Header - if ( parsed.headerOpt !== undefined ) { - units.push(FilterOnHeaders.compile(parsed)); - parsed.action |= HEADERS; - } - - // Modifier - // - // IMPORTANT: the modifier unit MUST always appear first in a sequence - if ( parsed.modifyType !== undefined ) { - units.unshift(FilterModifier.compile(parsed)); - parsed.action = (parsed.action & ~ActionBitsMask) | ModifyAction; - } - - const fdata = units.length === 1 - ? units[0] - : FilterCompositeAll.compile(units); - this.compileToAtomicFilter(parsed, fdata, writer); -}; - -/******************************************************************************/ - -FilterContainer.prototype.compileToAtomicFilter = function( - parsed, - fdata, - writer -) { - const catBits = parsed.action | parsed.party; - let { typeBits } = parsed; - - // Typeless - if ( typeBits === 0 ) { - writer.push([ catBits, parsed.tokenHash, fdata ]); - return; - } - // If all network types are set, create a typeless filter - if ( - (typeBits & allNetworkTypesBits) === allNetworkTypesBits && - (parsed.notTypeBits & documentTypeBit) === 0 - ) { - writer.push([ catBits, parsed.tokenHash, fdata ]); - typeBits &= ~allNetworkTypesBits; - } - - // One filter per specific types - let bitOffset = 1; - do { - if ( typeBits & 1 ) { - writer.push([ - catBits | (bitOffset << TypeBitsOffset), - parsed.tokenHash, - fdata - ]); - } - bitOffset += 1; - typeBits >>>= 1; - } while ( typeBits !== 0 ); +FilterContainer.prototype.createCompiler = function(parser) { + return new FilterCompiler(parser); }; /******************************************************************************/ @@ -4067,7 +4039,7 @@ FilterContainer.prototype.realmMatchString = function( let catBits = 0, iunit = 0; // Pure hostname-based filters - let tokenHash = this.dotTokenHash; + let tokenHash = DOT_TOKEN_HASH; if ( (bucket00 !== undefined) && (iunit = bucket00.get(tokenHash) || 0) !== 0 && @@ -4300,15 +4272,15 @@ FilterContainer.prototype.redirectRequest = function(redirectEngine, fctxt) { return directives; }; -const parseRedirectRequestValue = function(modifier) { +function parseRedirectRequestValue(modifier) { if ( modifier.cache === undefined ) { modifier.cache = StaticFilteringParser.parseRedirectValue(modifier.value); } return modifier.cache; -}; +} -const compareRedirectRequests = function(redirectEngine, a, b) { +function compareRedirectRequests(redirectEngine, a, b) { const { token: atok, priority: aint, bits: abits } = parseRedirectRequestValue(a.modifier); if ( redirectEngine.hasToken(atok) === false ) { return -1; } @@ -4322,7 +4294,7 @@ const compareRedirectRequests = function(redirectEngine, a, b) { if ( (bbits & AllowAction) !== 0 ) { return 1; } } return aint - bint; -}; +} /******************************************************************************/ @@ -4350,7 +4322,7 @@ FilterContainer.prototype.filterQuery = function(fctxt) { out.push(directive); break; } - const { all, bad, name, not, re } = this.parseQueryPruneValue(modifier); + const { all, bad, name, not, re } = parseQueryPruneValue(modifier); if ( bad ) { continue; } if ( all ) { if ( isException === false ) { params.clear(); } @@ -4400,13 +4372,13 @@ FilterContainer.prototype.filterQuery = function(fctxt) { return out; }; -FilterContainer.prototype.parseQueryPruneValue = function(modifier) { +function parseQueryPruneValue(modifier) { if ( modifier.cache === undefined ) { modifier.cache = StaticFilteringParser.parseQueryPruneValue(modifier.value); } return modifier.cache; -}; +} /******************************************************************************/ @@ -4418,18 +4390,9 @@ FilterContainer.prototype.hasQuery = function(fctxt) { /******************************************************************************/ FilterContainer.prototype.toLogData = function() { - if ( this.$filterUnit === 0 ) { return; } - const logData = toLogDataInternal( - this.$catBits, - this.$tokenHash, - this.$filterUnit - ); - logData.source = 'static'; - logData.tokenHash = this.$tokenHash; - logData.result = this.$filterUnit === 0 - ? 0 - : ((this.$catBits & AllowAction) === 0 ? 1 : 2); - return logData; + if ( this.$filterUnit !== 0 ) { + return new LogData(this.$catBits, this.$tokenHash, this.$filterUnit); + } }; /******************************************************************************/ @@ -4626,46 +4589,6 @@ FilterContainer.prototype.filterClassHistogram = function() { /******************************************************************************/ -FilterContainer.prototype.tokenHistograms = async function(requests) { - if ( Array.isArray(requests) === false || requests.length === 0 ) { - console.info('No requests found to benchmark'); - return; - } - - console.info(`Computing token histograms...`); - - const fctxt = new FilteringContext(); - const missTokenMap = new Map(); - const hitTokenMap = new Map(); - const reTokens = /[0-9a-z%]{2,}/g; - - for ( let i = 0; i < requests.length; i++ ) { - const request = requests[i]; - fctxt.setURL(request.url); - fctxt.setDocOriginFromURL(request.frameUrl); - fctxt.setType(request.cpt); - const r = this.matchRequest(fctxt); - for ( let [ keyword ] of request.url.toLowerCase().matchAll(reTokens) ) { - const token = keyword; - if ( r === 0 ) { - missTokenMap.set(token, (missTokenMap.get(token) || 0) + 1); - } else if ( r === 1 ) { - hitTokenMap.set(token, (hitTokenMap.get(token) || 0) + 1); - } - } - } - const customSort = (a, b) => b[1] - a[1]; - const topmisses = Array.from(missTokenMap).sort(customSort).slice(0, 100); - for ( const [ token ] of topmisses ) { - hitTokenMap.delete(token); - } - const tophits = Array.from(hitTokenMap).sort(customSort).slice(0, 100); - console.info('Misses:', JSON.stringify(topmisses)); - console.info('Hits:', JSON.stringify(tophits)); -}; - -/******************************************************************************/ - const staticNetFilteringEngine = new FilterContainer(); export default staticNetFilteringEngine; diff --git a/src/js/storage.js b/src/js/storage.js index 27120a8d3..0d56207b2 100644 --- a/src/js/storage.js +++ b/src/js/storage.js @@ -973,6 +973,7 @@ self.addEventListener('hiddenSettingsChanged', ( ) => { // https://adblockplus.org/en/filters const lineIter = new LineIterator(this.preparseDirectives.prune(rawText)); const parser = new StaticFilteringParser({ expertMode }); + const compiler = staticNetFilteringEngine.createCompiler(parser); parser.setMaxTokenLength(staticNetFilteringEngine.MAX_TOKEN_LENGTH); @@ -1000,12 +1001,12 @@ self.addEventListener('hiddenSettingsChanged', ( ) => { if ( parser.patternHasUnicode() && parser.toASCII() === false ) { continue; } - if ( staticNetFilteringEngine.compile(parser, writer) ) { continue; } - if ( staticNetFilteringEngine.error !== undefined ) { + if ( compiler.compile(parser, writer) ) { continue; } + if ( compiler.error !== undefined ) { logger.writeOne({ realm: 'message', type: 'error', - text: staticNetFilteringEngine.error + text: compiler.error }); } } diff --git a/src/js/tab.js b/src/js/tab.js index 5127bf59f..9d1e7e2ce 100644 --- a/src/js/tab.js +++ b/src/js/tab.js @@ -231,14 +231,13 @@ const onPopupUpdated = (( ) => { popunderHostname, result ) { - if ( - fctxt.filter === undefined || - fctxt.filter !== 'static' || - fctxt.filter.token === staticNetFilteringEngine.noTokenHash - ) { + if ( fctxt.filter === undefined || fctxt.filter !== 'static' ) { return 0; } - if ( fctxt.filter.token === staticNetFilteringEngine.dotTokenHash ) { + if ( fctxt.filter.isUntokenized() ) { + return 0; + } + if ( fctxt.filter.isPureHostname() ) { return result; } const re = new RegExp(fctxt.filter.regex, 'i');