From 3f299ef62382953e524981e790fd6ff6172b501a Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Fri, 18 Sep 2020 10:23:02 -0400 Subject: [PATCH] Improve validation of hostname in `domain=` and `denyallow` options Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/1249 For "exotic" hostname values, the browser's own API will be used to ultimately validate hostname values. --- src/js/static-filtering-parser.js | 135 +++++++++++++++--------------- src/js/static-net-filtering.js | 64 +++++++------- 2 files changed, 102 insertions(+), 97 deletions(-) diff --git a/src/js/static-filtering-parser.js b/src/js/static-filtering-parser.js index 2a4f823ce..a2860465f 100644 --- a/src/js/static-filtering-parser.js +++ b/src/js/static-filtering-parser.js @@ -108,6 +108,10 @@ const Parser = class { this.reHostsSource = /^[^\x00-\x24\x26-\x29\x2B\x2C\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x7F]+$/; this.reUnicodeChar = /[^\x00-\x7F]/; this.reUnicodeChars = /[^\x00-\x7F]/g; + this.reHostnameLabel = /[^.]+/g; + this.rePlainHostname = /^(?:[\w-]+\.)*[a-z]+$/; + this.rePlainEntity = /^(?:[\w-]+\.)+\*$/; + this.reEntity = /^[^*]+\.\*$/; this.punycoder = new URL(self.location); this.selectorCompiler = new this.SelectorCompiler(this); // TODO: reuse for network filtering analysis @@ -313,7 +317,7 @@ const Parser = class { analyzeExtExtra() { if ( this.hasOptions() ) { const { i, len } = this.optionsSpan; - this.analyzeDomainList(i, i + len, BITComma, 0b11); + this.analyzeDomainList(i, i + len, BITComma, 0b1110); } if ( hasBits(this.flavorBits, BITFlavorUnsupported) ) { this.markSpan(this.patternSpan, BITError); @@ -668,66 +672,62 @@ const Parser = class { } } - // bits: - // 0: can use entity-based hostnames - // 1: can use single wildcard - analyzeDomain(from, to, optionBits) { - const { slices } = this; - let len = to - from; - if ( len === 0 ) { return false; } - const not = hasBits(slices[from], BITTilde); - if ( not ) { - if ( (optionBits & 0b01) === 0 || slices[from+2] > 1 ) { return false; } - from += 3; - len -= 3; + analyzeDomain(from, to, modeBits) { + if ( to === from ) { return false; } + return this.normalizeHostnameValue( + this.strFromSlices(from, to - 3), + modeBits + ) !== undefined; + } + + // Ultimately, let the browser API do the hostname normalization, after + // making some other trivial checks. + // + // modeBits: + // 0: can use wildcard at any position + // 1: can use entity-based hostnames + // 2: can use single wildcard + // 3: can be negated + normalizeHostnameValue(s, modeBits = 0b0000) { + const not = s.charCodeAt(0) === 0x7E /* '~' */; + if ( not && (modeBits & 0b1000) === 0 ) { return; } + let hn = not === false ? s : s.slice(1); + if ( this.rePlainHostname.test(hn) ) { return s; } + const hasWildcard = hn.lastIndexOf('*') !== -1; + if ( hasWildcard ) { + if ( modeBits === 0 ) { return; } + if ( hn.length === 1 ) { + if ( not || (modeBits & 0b0100) === 0 ) { return; } + return s; + } + if ( (modeBits & 0b0010) !== 0 ) { + if ( this.rePlainEntity.test(hn) ) { return s; } + if ( this.reEntity.test(hn) === false ) { return; } + } else if ( (modeBits & 0b0001) === 0 ) { + return; + } + hn = hn.replace(/\*/g, '__asterisk__'); + } + this.punycoder.hostname = '_'; + try { + this.punycoder.hostname = hn; + hn = this.punycoder.hostname; + } catch (_) { + return; + } + if ( hn === '_' || hn === '' ) { return; } + if ( hasWildcard ) { + hn = this.punycoder.hostname.replace(/__asterisk__/g, '*'); } - if ( len === 0 ) { return false; } - // One slice only, check for single asterisk if ( - len === 3 && - not === false && - (optionBits & 0b10) !== 0 && - hasBits(slices[from], BITAsterisk) + (modeBits & 0b0001) === 0 && ( + hn.charCodeAt(0) === 0x2E /* '.' */ || + hn.charCodeAt(hn.length - 1) === 0x2E /* '.' */ + ) ) { - return slices[from+2] === 1; + return; } - // First slice must be regex-equivalent of `\w` - if ( hasNoBits(slices[from], BITRegexWord | BITUnicode) ) { return false; } - // Last slice - if ( len > 3 ) { - const last = to - 3; - if ( hasBits(slices[last], BITAsterisk) ) { - if ( - (optionBits & 0b01) === 0 || - len < 9 || - slices[last+2] > 1 || - hasNoBits(slices[last-3], BITPeriod) - ) { - return false; - } - } else if ( hasNoBits(slices[to-3], BITAlphaNum | BITUnicode) ) { - return false; - } - } - // Middle slices - if ( len > 6 ) { - for ( let i = from + 3; i < to - 3; i += 3 ) { - const bits = slices[i]; - if ( hasNoBits(bits, BITHostname) ) { return false; } - if ( hasBits(bits, BITPeriod) && slices[i+2] > 1 ) { - return false; - } - if ( - hasBits(bits, BITDash) && ( - hasNoBits(slices[i-3], BITRegexWord | BITUnicode) || - hasNoBits(slices[i+3], BITRegexWord | BITUnicode) - ) - ) { - return false; - } - } - } - return true; + return not ? '~' + hn : hn; } slice(raw) { @@ -1081,6 +1081,8 @@ const Parser = class { // Be ready to deal with non-punycode-able Unicode characters. // https://github.com/uBlockOrigin/uBlock-issues/issues/772 // Encode Unicode characters beyond the hostname part. + // Prepend with '*' character to prevent the browser API from refusing to + // punycode -- this occurs when the extracted label starts with a dash. toASCII(dryrun = false) { if ( this.patternHasUnicode() === false ) { return true; } const { i, len } = this.patternSpan; @@ -1090,16 +1092,14 @@ const Parser = class { // Punycode hostname part of the pattern. if ( patternIsRegex === false ) { const match = this.reHostname.exec(pattern); - if ( match === null ) { return true; } - try { - this.punycoder.hostname = match[0].replace(/\*/g, '__asterisk__'); - } catch(ex) { - return false; + if ( match !== null ) { + const hn = match[0].replace(this.reHostnameLabel, s => { + if ( this.reUnicodeChar.test(s) === false ) { return s; } + if ( s.charCodeAt(0) === 0x2D /* '-' */ ) { s = '*' + s; } + return this.normalizeHostnameValue(s, 0b0001) || s; + }); + pattern = hn + pattern.slice(match.index + match[0].length); } - const hn = this.punycoder.hostname; - if ( hn === '' ) { return false; } - const punycoded = hn.replace(/__asterisk__/g, '*'); - pattern = punycoded + pattern.slice(match.index + match[0].length); } // Percent-encode remaining Unicode characters. if ( this.reUnicodeChar.test(pattern) ) { @@ -1755,7 +1755,6 @@ const BITError = 1 << 31; const BITAll = 0xFFFFFFFF; const BITAlphaNum = BITNum | BITAlpha; -const BITRegexWord = BITAlphaNum | BITUnderscore; const BITHostname = BITNum | BITAlpha | BITUppercase | BITDash | BITPeriod | BITUnderscore | BITUnicode; const BITPatternToken = BITNum | BITAlpha | BITPercent; const BITLineComment = BITExclamation | BITHash | BITSquareBracket; @@ -2226,7 +2225,7 @@ const NetOptionsIterator = class { if ( this.interactive && hasBits(descriptor, OPTDomainList) ) { this.parser.analyzeDomainList( lval + 3, i, BITPipe, - (descriptor & 0xFF) === OPTTokenDomain ? 0b01 : 0b00 + (descriptor & 0xFF) === OPTTokenDomain ? 0b1010 : 0b0000 ); } } else { diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 8c8326a39..eef85a7f4 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -20,7 +20,6 @@ */ /* jshint bitwise: false */ -/* global punycode */ 'use strict'; @@ -1120,12 +1119,12 @@ const filterOrigin = (( ) => { this.trieContainer = new µb.HNTrieContainer(); } - compile(domainOpt, prepend, units) { + compile(domainOptList, prepend, units) { const hostnameHits = []; const hostnameMisses = []; const entityHits = []; const entityMisses = []; - for ( const s of FilterParser.domainOptIterator(domainOpt) ) { + for ( const s of domainOptList ) { const len = s.length; const beg = len > 1 && s.charCodeAt(0) === 0x7E ? 1 : 0; const end = len > 2 && @@ -1770,7 +1769,7 @@ const FilterDenyAllow = class { } static compile(details) { - return [ FilterDenyAllow.fid, details.denyallow ]; + return [ FilterDenyAllow.fid, details.denyallowOpt ]; } static fromCompiled(args) { @@ -2074,17 +2073,15 @@ const FILTER_SEQUENCES_MIN = filterSequenceWritePtr; const FilterParser = class { constructor(parser) { this.cantWebsocket = vAPI.cantWebsocket; - this.domainOpt = ''; this.noTokenHash = urlTokenizer.noTokenHash; - this.reBadDomainOptChars = /[+?^${}()[\]\\]/; this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/; - this.reHasUnicode = /[^\x00-\x7F]/; this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/; this.reRegexToken = /[%0-9A-Za-z]{2,}/g; this.reRegexTokenAbort = /[([]/; this.reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/; this.reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/; this.reGoodToken = /[%0-9a-z]{1,}/g; + this.domainOptList = []; this.tokenIdToNormalizedType = new Map([ [ parser.OPTTokenCname, bitFromType('cname') ], [ parser.OPTTokenCss, bitFromType('stylesheet') ], @@ -2237,7 +2234,7 @@ const FilterParser = class { this.thirdParty = false; this.party = AnyParty; this.domainOpt = ''; - this.denyallow = ''; + this.denyallowOpt = ''; this.isPureHostname = false; this.isRegex = false; this.redirect = 0; @@ -2291,20 +2288,24 @@ const FilterParser = class { } } - parseHostnameList(parser, s) { - if ( parser.optionHasUnicode() ) { - const hostnames = s.split('|'); - let i = hostnames.length; - while ( i-- ) { - if ( this.reHasUnicode.test(hostnames[i]) ) { - hostnames[i] = punycode.toASCII(hostnames[i]); - } + parseHostnameList(parser, s, modeBits, out = []) { + let beg = 0; + let slen = s.length; + let i = 0; + while ( beg < slen ) { + let end = s.indexOf('|', beg); + if ( end === -1 ) { end = slen; } + const hn = parser.normalizeHostnameValue( + s.slice(beg, end), + modeBits + ); + if ( hn !== undefined ) { + out[i] = hn; i += 1; } - s = hostnames.join('|'); + beg = end + 1; } - // TODO: revisit - if ( this.reBadDomainOptChars.test(s) ) { return ''; } - return s; + out.length = i; + return i === 1 ? out[0] : out.join('|'); } parseOptions(parser) { @@ -2337,12 +2338,17 @@ const FilterParser = class { // Detect and discard filter if domain option contains nonsensical // characters. case parser.OPTTokenDomain: - this.domainOpt = this.parseHostnameList(parser, val); + this.domainOpt = this.parseHostnameList( + parser, + val, + 0b1010, + this.domainOptList + ); if ( this.domainOpt === '' ) { return false; } break; case parser.OPTTokenDenyAllow: - this.denyallow = this.parseHostnameList(parser, val); - if ( this.denyallow === '' ) { return false; } + this.denyallowOpt = this.parseHostnameList(parser, val, 0b0000); + if ( this.denyallowOpt === '' ) { return false; } break; // https://www.reddit.com/r/uBlockOrigin/comments/d6vxzj/ // Add support for `elemhide`. Rarely used but it happens. @@ -2559,7 +2565,7 @@ const FilterParser = class { isJustOrigin() { return this.isRegex === false && this.dataType === undefined && - this.denyallow === '' && + this.denyallowOpt === '' && this.domainOpt !== '' && ( this.pattern === '*' || ( this.anchor === 0b010 && @@ -2961,7 +2967,7 @@ FilterContainer.prototype.compile = function(parser, writer) { if ( parsed.isPureHostname && parsed.domainOpt === '' && - parsed.denyallow === '' && + parsed.denyallowOpt === '' && parsed.dataType === undefined ) { parsed.tokenHash = this.dotTokenHash; @@ -2990,7 +2996,7 @@ FilterContainer.prototype.compile = function(parser, writer) { parsed.tokenHash = this.anyHTTPTokenHash; } const entities = []; - for ( const hn of FilterParser.domainOptIterator(parsed.domainOpt) ) { + for ( const hn of parsed.domainOptList ) { if ( parsed.domainIsEntity(hn) === false ) { this.compileToAtomicFilter(parsed, hn, writer); } else { @@ -3004,7 +3010,7 @@ FilterContainer.prototype.compile = function(parser, writer) { const units = []; filterPattern.compile(parsed, units); if ( leftAnchored ) { units.push(FilterAnchorLeft.compile()); } - filterOrigin.compile(entity, true, units); + filterOrigin.compile([ entity ], true, units); this.compileToAtomicFilter( parsed, FilterCompositeAll.compile(units), writer ); @@ -3034,14 +3040,14 @@ FilterContainer.prototype.compile = function(parser, writer) { // Origin if ( parsed.domainOpt !== '' ) { filterOrigin.compile( - parsed.domainOpt, + parsed.domainOptList, units.length !== 0 && filterClasses[units[0][0]].isSlow === true, units ); } // Deny-allow - if ( parsed.denyallow !== '' ) { + if ( parsed.denyallowOpt !== '' ) { units.push(FilterDenyAllow.compile(parsed)); }