From 51bb22097cd0cdffece56fec55a12bfa84adf2f6 Mon Sep 17 00:00:00 2001 From: gorhill Date: Fri, 19 Sep 2014 10:59:44 -0400 Subject: [PATCH] this fixes #235 --- js/background.js | 2 +- js/liquid-dict.js | 4 +- js/net-filtering.js | 199 +++++++++++++++++++++++++++++--------------- js/storage.js | 29 +------ 4 files changed, 138 insertions(+), 96 deletions(-) diff --git a/js/background.js b/js/background.js index 4fd232448..d07cd61e7 100644 --- a/js/background.js +++ b/js/background.js @@ -89,7 +89,7 @@ return { firstUpdateAfter: 5 * oneMinute, nextUpdateAfter: 7 * oneHour, - selfieMagic: 'ccolmudazpvm', + selfieMagic: 'rniacaqskjwz', selfieAfter: 7 * oneMinute, pageStores: {}, diff --git a/js/liquid-dict.js b/js/liquid-dict.js index 3c81408a2..cd15a72f8 100644 --- a/js/liquid-dict.js +++ b/js/liquid-dict.js @@ -36,7 +36,7 @@ var LiquidDict = function() { // Somewhat arbitrary: I need to come up with hard data to know at which // point binary search is better than indexOf. - this.cutoff = 500; + this.cutoff = 256; }; /******************************************************************************/ @@ -127,7 +127,7 @@ LiquidDict.prototype.test = function(word) { return bucket[word] !== undefined; } if ( bucket.charAt(0) === ' ' ) { - return bucket.indexOf(' ' + word + ' ') >= 0; + return bucket.indexOf(' ' + word + ' ') !== -1; } // binary search var len = word.length; diff --git a/js/net-filtering.js b/js/net-filtering.js index 33fdf95e9..dc029e1b0 100644 --- a/js/net-filtering.js +++ b/js/net-filtering.js @@ -80,6 +80,7 @@ var reIgnoreComment = /^\[|^!/; var reHostnameRule = /^[0-9a-z][0-9a-z.-]+[0-9a-z]$/; var reHostnameToken = /^[0-9a-z]+/g; var reGoodToken = /[%0-9a-z]{2,}/g; +var reURLPostHostnameAnchors = /[\/?#]/; var typeNameToTypeValue = { 'stylesheet': 2 << 9, @@ -166,6 +167,9 @@ Filters family tree: - anchored at end - no hostname - specific hostname + - anchored within hostname + - no hostname + - specific hostname (not implemented) - one wildcard - 
anywhere @@ -177,6 +181,9 @@ Filters family tree: - anchored at end - no hostname - specific hostname + - anchored within hostname + - no hostname (not implemented) + - specific hostname (not implemented) - more than one wildcard - anywhere @@ -188,6 +195,9 @@ Filters family tree: - anchored at end - no hostname - specific hostname + - anchored within hostname + - no hostname (not implemented) + - specific hostname (not implemented) */ @@ -458,6 +468,41 @@ FilterPlainRightAnchoredHostname.fromSelfie = function(s) { /******************************************************************************/ +// https://github.com/gorhill/uBlock/issues/235 +// The filter is left-anchored somewhere within the hostname part of the URL. + +var FilterPlainHnAnchored = function(s) { + this.s = s; +}; + +FilterPlainHnAnchored.prototype.match = function(url, tokenBeg) { + if ( url.substr(tokenBeg, this.s.length) !== this.s ) { + return false; + } + // Valid only if hostname-valid characters to the left of token + var pos = url.indexOf('://'); + return pos !== -1 && + reURLPostHostnameAnchors.test(url.slice(pos + 3, tokenBeg)) === false; +}; + +FilterPlainHnAnchored.prototype.fid = 'h|a'; + +FilterPlainHnAnchored.prototype.toString = function() { + return '||' + this.s; +}; + +FilterPlainHnAnchored.prototype.toSelfie = function() { + return this.s; +}; + +FilterPlainHnAnchored.fromSelfie = function(s) { + return new FilterPlainHnAnchored(s); +}; + +// https://www.youtube.com/watch?v=71YS6xDB-E4 + +/******************************************************************************/ + // With a single wildcard, regex is not optimal. // See: // http://jsperf.com/regexp-vs-indexof-abp-miss/3 @@ -770,6 +815,24 @@ FilterManyWildcardsHostname.fromSelfie = function(s) { /******************************************************************************/ +// TODO: Some buckets may grow quite large (see histogram excerpt below). 
+// Evaluate the gain from having an internal dictionary for such large +// buckets: the key would be created by concatenating the char preceding and +// following the token. The dict would contain smaller buckets, and there +// would be a special bucket for those filters for which a prefix, suffix, or +// both is missing. +// I used to do this, but at a higher level, during tokenization, and in the +// end I found out the overhead was too much. I believe it will be a gain +// here because the special treatment would be only for a few specific tokens, +// not systematically done for all tokens. + +// key=Ȁ ad count=655 +// key=Ȁ ads count=432 +// key=̀ doubleclick count= 94 +// key=Ȁ adv count= 89 +// key=Ȁ google count= 67 +// key=Ȁ banner count= 55 + var FilterBucket = function(a, b) { this.f = null; this.filters = []; @@ -842,6 +905,9 @@ var makeFilter = function(details, tokenBeg) { if ( details.anchor > 0 ) { return new FilterPlainRightAnchored(s); } + if ( details.hostnameAnchored ) { + return new FilterPlainHnAnchored(s); + } if ( tokenBeg === 0 ) { return new FilterPlainPrefix0(s); } @@ -983,7 +1049,8 @@ FilterParser.prototype.reset = function() { this.f = ''; this.firstParty = false; this.fopts = ''; - this.hostname = false; + this.hostnameAnchored = false; + this.hostnamePure = false; this.hostnames.length = 0; this.notHostname = false; this.thirdParty = false; @@ -1060,6 +1127,12 @@ FilterParser.prototype.parse = function(s) { // important! this.reset(); + if ( reHostnameRule.test(s) ) { + this.f = s; + this.hostnamePure = this.hostnameAnchored = true; + return this; + } + // element hiding filter? 
if ( s.indexOf('##') >= 0 || s.indexOf('#@') >= 0 ) { this.elemHiding = true; @@ -1087,7 +1160,7 @@ FilterParser.prototype.parse = function(s) { // hostname anchoring if ( s.slice(0, 2) === '||' ) { - this.hostname = true; + this.hostnameAnchored = true; s = s.slice(2); } @@ -1110,7 +1183,12 @@ FilterParser.prototype.parse = function(s) { s = s.replace(/\*\*+/g, '*'); // remove leading and trailing wildcards - this.f = trimChar(s, '*'); + s = trimChar(s, '*'); + + // pure hostname-based? + this.hostnamePure = this.hostnameAnchored && reHostnameRule.test(s); + + this.f = s; if ( !this.fopts ) { return this; @@ -1274,6 +1352,7 @@ FilterContainer.prototype.fromSelfie = function(selfie) { '|ah': FilterPlainLeftAnchoredHostname, 'a|': FilterPlainRightAnchored, 'a|h': FilterPlainRightAnchoredHostname, + 'h|a': FilterPlainHnAnchored, '*': FilterSingleWildcard, '*h': FilterSingleWildcardHostname, '0*': FilterSingleWildcardPrefix0, @@ -1345,30 +1424,6 @@ FilterContainer.prototype.makeCategoryKey = function(category) { /******************************************************************************/ -FilterContainer.prototype.addAnyPartyHostname = function(hostname) { - if ( this.blockedAnyPartyHostnames.add(hostname) ) { - this.acceptedCount++; - this.blockFilterCount++; - return true; - } - this.duplicateCount++; - return false; -}; - -/******************************************************************************/ - -FilterContainer.prototype.add3rdPartyHostname = function(hostname) { - if ( this.blocked3rdPartyHostnames.add(hostname) ) { - this.acceptedCount++; - this.blockFilterCount++; - return true; - } - this.duplicateCount++; - return false; -}; - -/******************************************************************************/ - FilterContainer.prototype.add = function(s) { // ORDER OF TESTS IS IMPORTANT! 
@@ -1396,31 +1451,35 @@ FilterContainer.prototype.add = function(s) { return false; } + this.processedFilterCount += 1; + this.acceptedCount += 1; + + // Pure hostnames, use more efficient liquid dict + if ( parsed.hostnamePure && parsed.action === BlockAction ) { + if ( parsed.fopts === '' ) { + if ( this.blockedAnyPartyHostnames.add(parsed.f) ) { + this.blockFilterCount++; + } else { + this.duplicateCount++; + } + return true; + } + if ( parsed.fopts === 'third-party' ) { + if ( this.blocked3rdPartyHostnames.add(parsed.f) ) { + this.blockFilterCount++; + } else { + this.duplicateCount++; + } + return true; + } + } + if ( this.duplicates[s] ) { this.duplicateCount++; return false; } this.duplicates[s] = true; - this.processedFilterCount += 1; - - // Ignore optionless hostname rules, these will be taken care of by µBlock. - if ( parsed.hostname && parsed.fopts === '' && parsed.action === BlockAction && reHostnameRule.test(parsed.f) ) { - return false; - } - - this.acceptedCount += 1; - - // Pure third-party hostnames, use more efficient liquid dict - if ( reHostnameRule.test(parsed.f) && parsed.hostname && parsed.action === BlockAction ) { - if ( parsed.fopts === 'third-party' ) { - return this.blocked3rdPartyHostnames.add(parsed.f); - } - if ( parsed.fopts === '' ) { - return this.blockedAnyPartyHostnames.add(parsed.f); - } - } - var r = this.addFilter(parsed); if ( r === false ) { return false; @@ -1439,16 +1498,22 @@ FilterContainer.prototype.add = function(s) { FilterContainer.prototype.addFilter = function(parsed) { // TODO: avoid duplicates - var matches = parsed.hostname ? findHostnameToken(parsed.f) : findFirstGoodToken(parsed.f); + var matches = parsed.hostnameAnchored ? + findHostnameToken(parsed.f) : + findFirstGoodToken(parsed.f); if ( !matches || !matches[0].length ) { return false; } var tokenBeg = matches.index; - var tokenEnd = parsed.hostname ? reHostnameToken.lastIndex : reGoodToken.lastIndex; + var tokenEnd = parsed.hostnameAnchored ? 
+ reHostnameToken.lastIndex : + reGoodToken.lastIndex; var filter; var i = parsed.hostnames.length; + // Applies to specific domains + if ( i !== 0 && !parsed.notHostname ) { while ( i-- ) { filter = makeHostnameFilter(parsed, tokenBeg, parsed.hostnames[i]); @@ -1466,6 +1531,8 @@ FilterContainer.prototype.addFilter = function(parsed) { return true; } + // Applies to all domains, with exception(s) + // https://github.com/gorhill/uBlock/issues/191 // Invert the purpose of the filter for negated hostnames if ( i !== 0 && parsed.notHostname ) { @@ -1498,6 +1565,8 @@ FilterContainer.prototype.addFilter = function(parsed) { return true; } + // Applies to all domains without exceptions + filter = makeFilter(parsed, tokenBeg); if ( !filter ) { return false; @@ -1630,18 +1699,15 @@ FilterContainer.prototype.matchTokens = function(url) { // specialized to deal with other complex filters. FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) { - // Quick test first - if ( this.blockedAnyPartyHostnames.test(requestHostname) ) { - return '||' + requestHostname + '^'; - } - // Check parent hostnames if quick test failed - var hostnames = µb.URI.parentHostnamesFromHostname(requestHostname); - for ( var i = 0, n = hostnames.length; i < n; i++ ) { - if ( this.blockedAnyPartyHostnames.test(hostnames[i]) ) { - return '||' + hostnames[i] + '^'; + var pos; + while ( this.blockedAnyPartyHostnames.test(requestHostname) !== true ) { + pos = requestHostname.indexOf('.'); + if ( pos === -1 ) { + return false; } + requestHostname = requestHostname.slice(pos + 1); } - return false; + return '||' + requestHostname + '^'; }; /******************************************************************************/ @@ -1655,18 +1721,15 @@ FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) { // specialized to deal with other complex filters. 
FilterContainer.prototype.match3rdPartyHostname = function(requestHostname) { - // Quick test first - if ( this.blocked3rdPartyHostnames.test(requestHostname) ) { - return '||' + requestHostname + '^$third-party'; - } - // Check parent hostnames if quick test failed - var hostnames = µb.URI.parentHostnamesFromHostname(requestHostname); - for ( var i = 0, n = hostnames.length; i < n; i++ ) { - if ( this.blocked3rdPartyHostnames.test(hostnames[i]) ) { - return '||' + hostnames[i] + '^$third-party'; + var pos; + while ( this.blocked3rdPartyHostnames.test(requestHostname) !== true ) { + pos = requestHostname.indexOf('.'); + if ( pos === -1 ) { + return false; } + requestHostname = requestHostname.slice(pos + 1); } - return false; + return '||' + requestHostname + '^$third-party'; }; /******************************************************************************/ diff --git a/js/storage.js b/js/storage.js index e25bc82fd..b7e17c630 100644 --- a/js/storage.js +++ b/js/storage.js @@ -339,9 +339,7 @@ var parseCosmeticFilters = this.userSettings.parseAllABPHideFilters; var duplicateCount = netFilteringEngine.duplicateCount + cosmeticFilteringEngine.duplicateCount; var acceptedCount = netFilteringEngine.acceptedCount + cosmeticFilteringEngine.acceptedCount; - var reLocalhost = /(^|\s)(localhost\.localdomain|localhost|local|broadcasthost|0\.0\.0\.0|127\.0\.0\.1|::1|fe80::1%lo0)(?=\s|$)/g; - var reAdblockFilter = /^[^a-z0-9:]|[^a-z0-9]$|[^a-z0-9_:.-]/; - var reAdblockHostFilter = /^\|\|([a-z0-9.-]+[a-z0-9])\^?$/; + var reLocalhost = /(?:^|\s)(?:localhost\.localdomain|localhost|local|broadcasthost|0\.0\.0\.0|127\.0\.0\.1|::1|fe80::1%lo0)(?=\s|$)/g; var reAsciiSegment = /^[\x21-\x7e]+$/; var matches; var lineBeg = 0, lineEnd, currentLineBeg; @@ -392,7 +390,8 @@ // The filter is whatever sequence of printable ascii character without // whitespaces matches = reAsciiSegment.exec(line); - if ( !matches || matches.length === 0 ) { + if ( matches === null ) { + 
//console.debug('µBlock.mergeUbiquitousBlacklist(): skipping "%s"', lineRaw); continue; } @@ -404,27 +403,7 @@ continue; } - line = matches[0]; - - // Likely an ABP net filter? - if ( reAdblockFilter.test(line) ) { - if ( netFilteringEngine.add(line) ) { - continue; - } - // rhill 2014-01-22: Transpose possible Adblock Plus-filter syntax - // into a plain hostname if possible. - matches = reAdblockHostFilter.exec(line); - if ( !matches || matches.length < 2 ) { - continue; - } - line = matches[1]; - } - - if ( line === '' ) { - continue; - } - - netFilteringEngine.addAnyPartyHostname(line); + netFilteringEngine.add(matches[0]); } // For convenience, store the number of entries for this