1
0
mirror of https://github.com/gorhill/uBlock.git synced 2024-11-05 18:32:30 +01:00

Add HNTrie-based filter classes to store origin-only filters

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622

Following the STrie-related work in the above issue, I noticed that a
large number of filters in EasyList were filters which only had to match
against the document origin. For instance, among just the top 10
most populous buckets, there were four such buckets with over
a hundred entries each:

- bits: 72, token: "http", 146 entries
- bits: 72, token: "https", 139 entries
- bits: 88, token: "http", 122 entries
- bits: 88, token: "https", 118 entries

The filters in these buckets have to be matched against all
network requests.

In order to leverage HNTrie for these filters[1], they are now handled
in a special way so as to ensure they all end up in a single HNTrie
(per bucket), which means that instead of scanning hundreds of entries
per URL, there is now a single scan per bucket per URL for these
apply-everywhere filters.

Now, any filter which fulfills ALL the following conditions will be
processed in a special manner internally:

- Is of the form `|https://` or `|http://` or `*`; and
- Does have a `domain=` option; and
- Does not have a negated domain in its `domain=` option; and
- Does not have a `csp=` option; and
- Does not have a `redirect=` option

If a filter does not fulfill ALL the conditions above, there is no
change in behavior.

A filter which matches ALL of the above will be processed in a special
manner:

- The `domain=` option will be decomposed so as to create as many
  distinct filters as there are distinct values in the `domain=` option
- This also applies to the `badfilter` version of the filter, which
  means it now becomes possible to `badfilter` only one of the
  distinct filters without having to `badfilter` all of them.
- The logger will always report these special filters with only a
  single hostname in the `domain=` option.

***

[1] HNTrie is currently WASM-ed on Firefox.
This commit is contained in:
Raymond Hill 2019-04-19 16:33:46 -04:00
parent fd9df4b374
commit 3f3a1543ea
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
4 changed files with 590 additions and 440 deletions

View File

@ -137,8 +137,8 @@ const µBlock = (function() { // jshint ignore:line
// Read-only
systemSettings: {
compiledMagic: 8, // Increase when compiled format changes
selfieMagic: 9 // Increase when selfie format changes
compiledMagic: 10, // Increase when compiled format changes
selfieMagic: 10 // Increase when selfie format changes
},
restoreBackupSettings: {

File diff suppressed because it is too large Load Diff

View File

@ -46,29 +46,29 @@ const STRIE_CHAR1_SLOT = STRIE_TRIE0_SLOT + 3; // 67 / 268
const STRIE_TRIE0_START = STRIE_TRIE0_SLOT + 4 << 2; // 272
const STrieContainer = function(details) {
if ( details instanceof Object === false ) { details = {}; }
const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
this.buf = new Uint8Array(Math.max(len, 131072));
this.buf32 = new Uint32Array(this.buf.buffer);
this.buf32[STRIE_TRIE0_SLOT] = STRIE_TRIE0_START;
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536;
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
};
const STrieContainer = class {
STrieContainer.prototype = {
constructor(details) {
if ( details instanceof Object === false ) { details = {}; }
const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
this.buf = new Uint8Array(Math.max(len, 131072));
this.buf32 = new Uint32Array(this.buf.buffer);
this.buf32[STRIE_TRIE0_SLOT] = STRIE_TRIE0_START;
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536;
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
}
//--------------------------------------------------------------------------
// Public methods
//--------------------------------------------------------------------------
reset: function() {
reset() {
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
},
}
matches: function(iroot, a, al) {
matches(iroot, a, al) {
const ar = a.length;
const char0 = this.buf32[STRIE_CHAR0_SLOT];
let icell = iroot;
@ -102,9 +102,9 @@ STrieContainer.prototype = {
if ( icell === 0 || this.buf32[icell+2] === 0 ) { return al; }
if ( al === ar ) { return -1; }
}
},
}
createOne: function(args) {
createOne(args) {
if ( Array.isArray(args) ) {
return new this.STrieRef(this, args[0], args[1]);
}
@ -118,13 +118,13 @@ STrieContainer.prototype = {
this.buf32[iroot+1] = 0;
this.buf32[iroot+2] = 0;
return new this.STrieRef(this, iroot, 0);
},
}
compileOne: function(trieRef) {
compileOne(trieRef) {
return [ trieRef.iroot, trieRef.size ];
},
}
add: function(iroot, s) {
add(iroot, s) {
const lschar = s.length;
if ( lschar === 0 ) { return 0; }
let ischar = 0;
@ -221,26 +221,17 @@ STrieContainer.prototype = {
}
return 1;
}
},
}
optimize: function() {
optimize() {
this.shrinkBuf();
return {
byteLength: this.buf.byteLength,
char0: this.buf32[STRIE_CHAR0_SLOT],
};
},
}
fromIterable: function(hostnames, add) {
if ( add === undefined ) { add = 'add'; }
const trieRef = this.createOne();
for ( const hn of hostnames ) {
trieRef[add](hn);
}
return trieRef;
},
serialize: function(encoder) {
serialize(encoder) {
if ( encoder instanceof Object ) {
return encoder.encode(
this.buf32.buffer,
@ -254,9 +245,9 @@ STrieContainer.prototype = {
this.buf32[STRIE_CHAR1_SLOT] + 3 >>> 2
)
);
},
}
unserialize: function(selfie, decoder) {
unserialize(selfie, decoder) {
const shouldDecode = typeof selfie === 'string';
let byteLength = shouldDecode
? decoder.decodeSize(selfie)
@ -272,23 +263,13 @@ STrieContainer.prototype = {
} else {
this.buf32.set(selfie);
}
},
//--------------------------------------------------------------------------
// Class to hold reference to a specific trie
//--------------------------------------------------------------------------
STrieRef: function(container, iroot, size) {
this.container = container;
this.iroot = iroot;
this.size = size;
},
}
//--------------------------------------------------------------------------
// Private methods
//--------------------------------------------------------------------------
addCell: function(idown, iright, v) {
addCell(idown, iright, v) {
let icell = this.buf32[STRIE_TRIE1_SLOT];
this.buf32[STRIE_TRIE1_SLOT] = icell + 12;
icell >>>= 2;
@ -296,9 +277,9 @@ STrieContainer.prototype = {
this.buf32[icell+1] = iright;
this.buf32[icell+2] = v;
return icell;
},
}
addSegment: function(segment) {
addSegment(segment) {
const lsegchar = segment.length;
if ( lsegchar === 0 ) { return 0; }
let char1 = this.buf32[STRIE_CHAR1_SLOT];
@ -309,9 +290,9 @@ STrieContainer.prototype = {
} while ( i !== lsegchar );
this.buf32[STRIE_CHAR1_SLOT] = char1;
return (lsegchar << 24) | isegchar;
},
}
growBuf: function(trieGrow, charGrow) {
growBuf(trieGrow, charGrow) {
const char0 = Math.max(
(this.buf32[STRIE_TRIE1_SLOT] + trieGrow + STRIE_PAGE_SIZE-1) & ~(STRIE_PAGE_SIZE-1),
this.buf32[STRIE_CHAR0_SLOT]
@ -322,16 +303,16 @@ STrieContainer.prototype = {
this.buf.length
);
this.resizeBuf(bufLen, char0);
},
}
shrinkBuf: function() {
shrinkBuf() {
const char0 = this.buf32[STRIE_TRIE1_SLOT] + 24;
const char1 = char0 + this.buf32[STRIE_CHAR1_SLOT] - this.buf32[STRIE_CHAR0_SLOT];
const bufLen = char1 + 256;
this.resizeBuf(bufLen, char0);
},
}
resizeBuf: function(bufLen, char0) {
resizeBuf(bufLen, char0) {
bufLen = bufLen + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
if (
bufLen === this.buf.length &&
@ -375,23 +356,35 @@ STrieContainer.prototype = {
this.buf32[STRIE_CHAR0_SLOT] = char0;
this.buf32[STRIE_CHAR1_SLOT] = char0 + charDataLen;
}
},
}
};
/******************************************************************************/
/*******************************************************************************
STrieContainer.prototype.STrieRef.prototype = {
add: function(pattern) {
Class to hold reference to a specific trie
*/
STrieContainer.prototype.STrieRef = class {
constructor(container, iroot, size) {
this.container = container;
this.iroot = iroot;
this.size = size;
}
add(pattern) {
if ( this.container.add(this.iroot, pattern) === 1 ) {
this.size += 1;
return true;
}
return false;
},
matches: function(a, al) {
}
matches(a, al) {
return this.container.matches(this.iroot, a, al);
},
[Symbol.iterator]: function() {
}
[Symbol.iterator]() {
return {
value: undefined,
done: false,
@ -441,5 +434,5 @@ STrieContainer.prototype.STrieRef.prototype = {
forks: [],
textDecoder: new TextDecoder()
};
},
}
};

View File

@ -41,70 +41,101 @@
// Benchmark for string-based tokens vs. safe-integer token values:
// https://gorhill.github.io/obj-vs-set-vs-map/tokenize-to-str-vs-to-int.html
µBlock.urlTokenizer = {
setURL: function(url) {
µBlock.urlTokenizer = new (class {
constructor() {
this._chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
this._validTokenChars = new Uint8Array(128);
for ( let i = 0, n = this._chars.length; i < n; i++ ) {
this._validTokenChars[this._chars.charCodeAt(i)] = i + 1;
}
this._charsEx = '0123456789%abcdefghijklmnopqrstuvwxyz*.';
this._validTokenCharsEx = new Uint8Array(128);
for ( let i = 0, n = this._charsEx.length; i < n; i++ ) {
this._validTokenCharsEx[this._charsEx.charCodeAt(i)] = i + 1;
}
this.dotTokenHash = this.tokenHashFromString('.');
this.anyTokenHash = this.tokenHashFromString('..');
this.anyHTTPSTokenHash = this.tokenHashFromString('..https');
this.anyHTTPTokenHash = this.tokenHashFromString('..http');
this.noTokenHash = this.tokenHashFromString('*');
this._urlIn = '';
this._urlOut = '';
this._tokenized = false;
this._tokens = [ 0 ];
}
setURL(url) {
if ( url !== this._urlIn ) {
this._urlIn = url;
this._urlOut = url.toLowerCase();
this._tokenized = false;
}
return this._urlOut;
},
}
// Tokenize on demand.
getTokens: function() {
if ( this._tokenized === false ) {
this._tokenize();
this._tokenized = true;
getTokens() {
if ( this._tokenized ) { return this._tokens; }
let i = this._tokenize();
i = this._appendTokenAt(i, this.anyTokenHash, 0);
if ( this._urlOut.startsWith('https://') ) {
i = this._appendTokenAt(i, this.anyHTTPSTokenHash, 0);
} else if ( this._urlOut.startsWith('http://') ) {
i = this._appendTokenAt(i, this.anyHTTPTokenHash, 0);
}
i = this._appendTokenAt(i, this.noTokenHash, 0);
this._tokens[i] = 0;
this._tokenized = true;
return this._tokens;
},
}
tokenHashFromString: function(s) {
var l = s.length;
_appendTokenAt(i, th, ti) {
this._tokens[i+0] = th;
this._tokens[i+1] = ti;
return i + 2;
}
tokenHashFromString(s) {
const l = s.length;
if ( l === 0 ) { return 0; }
if ( l === 1 ) {
if ( s === '*' ) { return 63; }
if ( s === '.' ) { return 62; }
}
var vtc = this._validTokenChars,
th = vtc[s.charCodeAt(0)];
for ( var i = 1; i !== 8 && i !== l; i++ ) {
const vtc = this._validTokenCharsEx;
let th = vtc[s.charCodeAt(0)];
for ( let i = 1; i !== 8 && i !== l; i++ ) {
th = th * 64 + vtc[s.charCodeAt(i)];
}
return th;
},
}
stringFromTokenHash: function(th) {
stringFromTokenHash(th) {
if ( th === 0 ) { return ''; }
if ( th === 63 ) { return '*'; }
if ( th === 62 ) { return '.'; }
const chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
let s = '';
while ( th > 0 ) {
s = `${chars.charAt((th & 0b111111)-1)}${s}`;
s = `${this._charsEx.charAt((th & 0b111111)-1)}${s}`;
th /= 64;
}
return s;
},
}
// https://github.com/chrisaljoudi/uBlock/issues/1118
// We limit to a maximum number of tokens.
_tokenize: function() {
var tokens = this._tokens,
url = this._urlOut,
l = url.length;
if ( l === 0 ) { tokens[0] = 0; return; }
_tokenize() {
const tokens = this._tokens;
let url = this._urlOut;
let l = url.length;
if ( l === 0 ) { return 0; }
if ( l > 2048 ) {
url = url.slice(0, 2048);
l = 2048;
}
var i = 0, j = 0, v, n, ti, th,
vtc = this._validTokenChars;
const vtc = this._validTokenChars;
let i = 0, j = 0, v, n, ti, th;
for (;;) {
for (;;) {
if ( i === l ) { tokens[j] = 0; return; }
if ( i === l ) { return j; }
v = vtc[url.charCodeAt(i++)];
if ( v !== 0 ) { break; }
}
@ -117,25 +148,12 @@
th = th * 64 + v;
n += 1;
}
tokens[j++] = th;
tokens[j++] = ti;
tokens[j+0] = th;
tokens[j+1] = ti;
j += 2;
}
},
_urlIn: '',
_urlOut: '',
_tokenized: false,
_tokens: [ 0 ],
_validTokenChars: (function() {
var vtc = new Uint8Array(128),
chars = '0123456789%abcdefghijklmnopqrstuvwxyz',
i = chars.length;
while ( i-- ) {
vtc[chars.charCodeAt(i)] = i + 1;
}
return vtc;
})()
};
}
})();
/******************************************************************************/