1
0
mirror of https://github.com/gorhill/uBlock.git synced 2024-11-06 02:42:33 +01:00
uBlock/js/abp-hide-filters.js
2014-08-02 11:40:27 -04:00

739 lines
22 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*******************************************************************************
µBlock - a Chromium browser extension to block requests.
Copyright (C) 2014 Raymond Hill
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see {http://www.gnu.org/licenses/}.
Home: https://github.com/gorhill/uBlock
*/
/* jshint bitwise: false */
/* global µBlock */
/******************************************************************************/
µBlock.abpHideFilters = (function(){
/******************************************************************************/
var µb = µBlock;
//var filterTestCount = 0;
//var bucketTestCount = 0;
/******************************************************************************/
/*
var histogram = function(label, buckets) {
var h = [],
bucket;
for ( var k in buckets ) {
if ( buckets.hasOwnProperty(k) === false ) {
continue;
}
bucket = buckets[k];
h.push({
k: k,
n: bucket instanceof FilterBucket ? bucket.filters.length : 1
});
}
console.log('Histogram %s', label);
var total = h.length;
h.sort(function(a, b) { return b.n - a.n; });
// Find indices of entries of interest
var target = 3;
for ( var i = 0; i < total; i++ ) {
if ( h[i].n === target ) {
console.log('\tEntries with only %d filter(s) start at index %s (key = "%s")', target, i, h[i].k);
target -= 1;
}
}
h = h.slice(0, 50);
h.forEach(function(v) {
console.log('\tkey="%s" count=%d', v.k, v.n);
});
console.log('\tTotal buckets count: %d', total);
};
*/
/******************************************************************************/
// Pure id- and class-based filters
// Examples:
// #A9AdsMiddleBoxTop
// .AD-POST
var FilterPlain = function(s) {
this.s = s;
};
FilterPlain.prototype.retrieve = function(s, out) {
if ( s === this.s ) {
out.push(this.s);
}
};
/******************************************************************************/
// Id- and class-based filters with extra selector stuff following.
// Examples:
// #center_col > div[style="font-size:14px;margin-right:0;min-height:5px"] ...
// #adframe:not(frameset)
// .l-container > #fishtank
var FilterPlainMore = function(s) {
this.s = s;
};
FilterPlainMore.prototype.retrieve = function(s, out) {
if ( s === this.s.slice(0, s.length) ) {
out.push(this.s);
}
};
/******************************************************************************/
// Any selector specific to a hostname
// Examples:
// search.snapdo.com###ABottomD
// facebook.com##.-cx-PRIVATE-fbAdUnit__root
// sltrib.com###BLContainer + div[style="height:90px;"]
// myps3.com.au##.Boxer[style="height: 250px;"]
// lindaikeji.blogspot.com##a > img[height="600"]
// japantimes.co.jp##table[align="right"][width="250"]
// mobilephonetalk.com##[align="center"] > b > a[href^="http://tinyurl.com/"]
var FilterHostname = function(s, hostname) {
this.s = s;
this.hostname = hostname;
};
FilterHostname.prototype.retrieve = function(hostname, out) {
if ( hostname.slice(-this.hostname.length) === this.hostname ) {
out.push(this.s);
}
};
/******************************************************************************/
// Any selector specific to an entity
// Examples:
// google.*###cnt #center_col > #res > #topstuff > .ts
var FilterEntity = function(s, entity) {
this.s = s;
this.entity = entity;
};
FilterEntity.prototype.retrieve = function(entity, out) {
if ( entity.slice(-this.entity.length) === this.entity ) {
out.push(this.s);
}
};
/******************************************************************************/
/******************************************************************************/
// TODO: evaluate the gain (if any) from avoiding the use of an array for when
// there are only two filters (or three, etc.). I suppose there is a specific
// number of filters below which using an array is more of an overhead than
// using a couple of property members.
// i.e. FilterBucket2, FilterBucket3, FilterBucketN.
var FilterBucket = function(a, b) {
this.filters = [a, b];
};
FilterBucket.prototype.add = function(a) {
this.filters.push(a);
};
FilterBucket.prototype.retrieve = function(s, out) {
var i = this.filters.length;
//filterTestCount += i - 1;
while ( i-- ) {
this.filters[i].retrieve(s, out);
}
};
/******************************************************************************/
/******************************************************************************/
var FilterParser = function() {
this.s = '';
this.prefix = '';
this.suffix = '';
this.anchor = 0;
this.filterType = '#';
this.hostnames = [];
this.invalid = false;
this.unsupported = false;
this.reParser = /^\s*([^#]*)(##|#@#)(.+)\s*$/;
this.rePlain = /^([#.][\w-]+)/;
this.rePlainMore = /^[#.][\w-]+[^\w-]/;
this.reElement = /^[a-z]/i;
};
/******************************************************************************/
FilterParser.prototype.reset = function() {
this.s = '';
this.prefix = '';
this.suffix = '';
this.anchor = '';
this.filterType = '#';
this.hostnames = [];
this.invalid = false;
return this;
};
/******************************************************************************/
FilterParser.prototype.parse = function(s) {
// important!
this.reset();
var matches = this.reParser.exec(s);
if ( matches === null || matches.length !== 4 ) {
this.invalid = true;
return this;
}
// Remember original string
this.s = s;
this.prefix = matches[1];
this.anchor = matches[2];
this.suffix = matches[3];
// 2014-05-23:
// https://github.com/gorhill/httpswitchboard/issues/260
// Any sequence of `#` longer than one means the line is not a valid
// cosmetic filter.
if ( this.suffix.indexOf('##') >= 0 ) {
this.invalid = true;
return this;
}
this.filterType = this.anchor.charAt(1);
if ( this.prefix !== '' ) {
this.hostnames = this.prefix.split(/\s*,\s*/);
}
return this;
};
/******************************************************************************/
FilterParser.prototype.isPlainMore = function() {
return this.rePlainMore.test(this.suffix);
};
/******************************************************************************/
FilterParser.prototype.isElement = function() {
return this.reElement.test(this.suffix);
};
/******************************************************************************/
FilterParser.prototype.extractPlain = function() {
var matches = this.rePlain.exec(this.suffix);
if ( matches && matches.length === 2 ) {
return matches[1];
}
return '';
};
/******************************************************************************/
/******************************************************************************/
var FilterContainer = function() {
this.filterParser = new FilterParser();
this.reset();
};
/******************************************************************************/
// Reset all, thus reducing to a minimum memory footprint of the context.
FilterContainer.prototype.reset = function() {
this.filterParser.reset();
this.frozen = false;
this.acceptedCount = 0;
this.processedCount = 0;
this.genericFilters = {};
this.hostnameFilters = {};
this.entityFilters = {};
this.hideUnfiltered = [];
this.hideLowGenerics = {};
this.hideHighGenerics = [];
this.donthideUnfiltered = [];
this.donthideLowGenerics = {};
this.donthideHighGenerics = [];
this.rejected = [];
this.duplicates = {};
this.duplicateCount = 0;
};
/******************************************************************************/
FilterContainer.prototype.add = function(s) {
s = s.trim();
var parsed = this.filterParser.parse(s);
if ( parsed.invalid ) {
return false;
}
this.processedCount += 1;
if ( this.duplicates[s] ) {
this.duplicateCount++;
return false;
}
this.duplicates[s] = true;
//if ( s === 'mail.google.com##.nH.adC > .nH > .nH > .u5 > .azN' ) {
// debugger;
//}
// hostname-based filters: with a hostname, narrowing is good enough, no
// need to further narrow.
if ( parsed.hostnames.length ) {
return this.addPrefixedFilter(parsed);
}
// no specific hostname, narrow using class or id.
var selectorType = parsed.suffix.charAt(0);
if ( selectorType === '#' || selectorType === '.' ) {
return this.addPlainFilter(parsed);
}
// no specific hostname, no class, no id.
// TO IMPLEMENT
// My idea of implementation so far is to return a pre-built container
// of these very generic filter, and let the content script sort out
// what it needs from it. Filters in that category are mostly
// `a[href^="..."]` kind of filters.
// Content script side, the unsorted container of selectors could be used
// in a querySelector() to figure which rules apply (if any), or they
// could just all be injected undiscriminately (not good).
if ( parsed.filterType === '#' ) {
this.hideUnfiltered.push(parsed.suffix);
} else {
this.donthideUnfiltered.push(parsed.suffix);
}
this.acceptedCount += 1;
return true;
};
/******************************************************************************/
FilterContainer.prototype.freezeGenerics = function(what) {
var selectors = this[what + 'Unfiltered'];
//console.log('%d highly generic selectors:\n', selectors.length, selectors.sort().join('\n'));
// ["title"] and ["alt"] will be sorted out manually, these are the most
// common generic selectors, aka "low generics". The rest will be put in
// the high genericity bin.
var lowGenerics = {};
var lowGenericCount = 0;
var re = /^(([a-z]*)\[(alt|title)="([^"]+)"\])$/;
var i = selectors.length;
var selector, matches;
while ( i-- ) {
selector = selectors[i];
matches = re.exec(selector);
if ( !matches ) {
continue;
}
lowGenerics[matches[1]] = true;
lowGenericCount++;
selectors.splice(i, 1);
}
// Chunksize is a compromise between number of selectors per chunk (the
// number of selectors querySelector() will have to deal with), and the
// number of chunks (the number of times querySelector() will have to
// be called.)
// Benchmarking shows this is a hot spot performance-wise for "heavy"
// sites (like say, Sports Illustrated, good test case). Not clear what
// better can be done at this point, I doubt javascript-side code can beat
// querySelector().
var chunkSize = Math.max(selectors.length >>> 3, 8);
var chunkified = [], chunk;
for (;;) {
chunk = selectors.splice(0, chunkSize);
if ( chunk.length === 0 ) {
break;
}
chunkified.push(chunk.join(','));
}
this[what + 'LowGenerics'] = lowGenerics;
this[what + 'LowGenericCount'] = lowGenericCount;
this[what + 'HighGenerics'] = chunkified;
this[what + 'Unfiltered'] = [];
};
/******************************************************************************/
FilterContainer.prototype.freeze = function() {
this.freezeGenerics('hide');
this.freezeGenerics('donthide');
this.filterParser.reset();
// console.debug('Number of duplicate cosmetic filters skipped:', this.duplicateCount);
this.duplicates = {};
this.frozen = true;
//console.log('µBlock> adp-hide-filters.js: %d filters accepted', this.acceptedCount);
//console.log('µBlock> adp-hide-filters.js: %d filters processed', this.processedCount);
//console.log('µBlock> adp-hide-filters.js: coverage is %s%', (this.acceptedCount * 100 / this.processedCount).toFixed(1));
//console.log('µBlock> adp-hide-filters.js: unfiltered hide selectors:', this.hideUnfiltered);
//console.log('µBlock> adp-hide-filters.js: unfiltered dont hide selectors:', this.donthideUnfiltered);
//console.log('µBlock> adp-hide-filters.js: rejected selectors:', this.rejected);
// histogram('allFilters', this.filters);
};
/******************************************************************************/
// Is
// 3 unicode chars
// | | |
//
// 00000000 TTTTTTTT PP PP PP PP PP PP PP PP
// | |
// | |
// | |
// | |
// | |
// | +-- ls 2-bit of 8 token chars
// |
// |
// +-- filter type ('#'=hide '@'=unhide)
//
var makeHash = function(type, token) {
// Ref: Given a URL, returns a unique 4-character long hash string
// Based on: FNV32a
// http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source
// The rest is custom, suited for µBlock.
var len = token.length;
var i2 = len >> 1;
var i4 = len >> 2;
var i8 = len >> 3;
var hint = (0x811c9dc5 ^ token.charCodeAt(0)) >>> 0;
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(i8);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(i4);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(i4+i8);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(i2);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(i2+i8);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(i2+i4);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
hint ^= token.charCodeAt(len-1);
hint += (hint<<1) + (hint<<4) + (hint<<7) + (hint<<8) + (hint<<24);
hint >>>= 0;
return String.fromCharCode(type.charCodeAt(0), hint & 0xFFFF);
};
/**
Histogram for above hash generator:
Histogram allFilters
Entries with only 3 filter(s) start at index 2706 (key = "#ꍵ")
Entries with only 2 filter(s) start at index 4349 (key = "#냶")
Entries with only 1 filter(s) start at index 6896 (key = "#퀛")
key="#싣" count=141
key="#隁" count=57
key="#Ꚇ" count=48
key="#" count=45
key="#캃" count=36
key="#력" count=33
key="#끻" count=30
key="#" count=26
key="#" count=25
key="#Ꮳ" count=24
key="#鵲" count=23
key="#䙇" count=20
key="#ḇ" count=19
key="#睅" count=19
key="#㔽" count=19
key="#뻧" count=18
key="#䕀" count=18
key="#퉫" count=17
key="#筙" count=17
key="#㮰" count=17
key="#鯛" count=16
key="#꛿" count=16
key="#꣱" count=16
key="#ü" count=16
key="#告" count=16
key="#╡" count=16
key="#㰁" count=16
key="#৹" count=16
key="#镳" count=15
key="#碇" count=15
key="#৾" count=15
key="#貿" count=15
key="#š" count=15
key="#" count=15
key="#" count=14
key="#ຏ" count=14
key="#낶" count=14
key="#瑻" count=14
key="#ৡ" count=14
key="#" count=13
key="#ᯋ" count=13
key="#⼒" count=13
key="#腫" count=13
key="#겚" count=13
key="#耏" count=13
key="#匋" count=13
key="#튦" count=13
key="#ﰹ" count=13
key="#㭴" count=13
key="#" count=13
Total buckets count: 12098
*/
/******************************************************************************/
FilterContainer.prototype.addPlainFilter = function(parsed) {
// Verify whether the plain selector is followed by extra selector stuff
if ( parsed.isPlainMore() ) {
return this.addPlainMoreFilter(parsed);
}
var f = new FilterPlain(parsed.suffix);
var hash = makeHash(parsed.filterType, parsed.suffix);
this.addFilterEntry(this.genericFilters, hash, f);
this.acceptedCount += 1;
};
/******************************************************************************/
FilterContainer.prototype.addPlainMoreFilter = function(parsed) {
var selectorSuffix = parsed.extractPlain();
if ( selectorSuffix === '' ) {
return;
}
var f = new FilterPlainMore(parsed.suffix);
var hash = makeHash(parsed.filterType, selectorSuffix);
this.addFilterEntry(this.genericFilters, hash, f);
this.acceptedCount += 1;
};
/******************************************************************************/
// rhill 2014-05-20: When a domain exists, just specify a generic selector.
FilterContainer.prototype.addHostnameFilter = function(hostname, parsed) {
var f = new FilterHostname(parsed.suffix, hostname);
var hash = makeHash(parsed.filterType, µb.URI.domainFromHostname(hostname));
this.addFilterEntry(this.hostnameFilters, hash, f);
};
/******************************************************************************/
FilterContainer.prototype.addEntityFilter = function(hostname, parsed) {
var f = new FilterEntity(parsed.suffix, hostname.slice(0, -2));
var entity = hostname.slice(0, -2);
var pos = entity.lastIndexOf('.');
if ( pos !== -1 ) {
entity = entity.slice(pos + 1);
}
var hash = makeHash(parsed.filterType, entity);
this.addFilterEntry(this.entityFilters, hash, f);
};
/******************************************************************************/
// rhill 2014-05-20: When a domain exists, just specify a generic selector.
FilterContainer.prototype.addPrefixedFilter = function(parsed) {
var hostnames = parsed.hostnames;
var i = hostnames.length, hostname;
while ( i-- ) {
hostname = hostnames[i];
if ( !hostname ) {
continue;
}
// rhill 2014-07-13: new filter class: entity.
if ( hostname.slice(-2) === '.*' ) {
this.addEntityFilter(hostname, parsed);
} else {
this.addHostnameFilter(hostname, parsed);
}
}
this.acceptedCount += 1;
};
/******************************************************************************/
FilterContainer.prototype.addFilterEntry = function(filterDict, hash, f) {
var bucket = filterDict[hash];
if ( bucket === undefined ) {
filterDict[hash] = f;
} else if ( bucket instanceof FilterBucket ) {
bucket.add(f);
} else {
filterDict[hash] = new FilterBucket(bucket, f);
}
};
/******************************************************************************/
FilterContainer.prototype.retrieveGenericSelectors = function(request) {
if ( µb.userSettings.parseAllABPHideFilters !== true ) {
return;
}
if ( !request.selectors ) {
return;
}
//quickProfiler.start('FilterContainer.retrieve()');
//filterTestCount = 0;
//bucketTestCount = 0;
var r = {
hide: [],
donthide: [],
hideLowGenerics: this.hideLowGenerics,
hideLowGenericCount: this.hideLowGenericCount,
hideHighGenerics: this.hideHighGenerics,
donthideLowGenerics: this.donthideLowGenerics,
donthideLowGenericCount: this.donthideLowGenericCount,
donthideHighGenerics: this.donthideHighGenerics
};
var hash, bucket;
var hideSelectors = r.hide;
var selectors = request.selectors;
var i = selectors.length;
var selector;
while ( i-- ) {
selector = selectors[i];
if ( !selector ) {
continue;
}
hash = makeHash('#', selector);
if ( bucket = this.genericFilters[hash] ) {
//bucketTestCount += 1;
//filterTestCount += 1;
bucket.retrieve(selector, hideSelectors);
}
}
//quickProfiler.stop();
//console.log(
// 'µBlock> abp-hide-filters.js: %d selectors in => %d selectors out',
// request.selectors.length,
// r.hide.length + r.donthide.length
//);
return r;
};
/******************************************************************************/
FilterContainer.prototype.retrieveDomainSelectors = function(request) {
if ( µb.userSettings.parseAllABPHideFilters !== true ) {
return;
}
if ( !request.locationURL ) {
return;
}
//quickProfiler.start('FilterContainer.retrieve()');
//filterTestCount = 0;
//bucketTestCount = 0;
var hostname = µb.URI.hostnameFromURI(request.locationURL);
var domain = µb.URI.domainFromHostname(hostname);
var pos = domain.indexOf('.');
var r = {
domain: domain,
entity: pos === -1 ? domain : domain.slice(0, pos - domain.length),
hide: [],
donthide: []
};
var bucket;
var hash = makeHash('#', r.domain);
if ( bucket = this.hostnameFilters[hash] ) {
//bucketTestCount += 1;
//filterTestCount += 1;
bucket.retrieve(hostname, r.hide);
}
hash = makeHash('#', r.entity);
if ( bucket = this.entityFilters[hash] ) {
//bucketTestCount += 1;
//filterTestCount += 1;
bucket.retrieve(pos === -1 ? domain : hostname.slice(0, pos - domain.length), r.hide);
}
hash = makeHash('@', r.domain);
if ( bucket = this.hostnameFilters[hash] ) {
//bucketTestCount += 1;
//filterTestCount += 1;
bucket.retrieve(hostname, r.donthide);
}
//quickProfiler.stop();
//console.log(
// 'µBlock> abp-hide-filters.js: "%s" => %d selectors out',
// request.locationURL,
// r.hide.length + r.donthide.length
//);
return r;
};
/******************************************************************************/
FilterContainer.prototype.getFilterCount = function() {
return this.acceptedCount;
};
/******************************************************************************/
return new FilterContainer();
/******************************************************************************/
})();
/******************************************************************************/