mirror of
https://github.com/gorhill/uBlock.git
synced 2024-11-05 18:32:30 +01:00
Add HNTrie-based filter classes to store origin-only filters
Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following the STrie-related work in the above issue, I noticed that a large number of filters in EasyList only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over a hundred entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries The filters in these buckets have to be matched against all network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfills ALL the following conditions will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have a `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, there is no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filters as there are distinct values in the `domain=` option - This also applies to the `badfilter` version of the filter, which means it now becomes possible to `badfilter` only one of the distinct filters without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
This commit is contained in:
parent
fd9df4b374
commit
3f3a1543ea
@ -137,8 +137,8 @@ const µBlock = (function() { // jshint ignore:line
|
||||
|
||||
// Read-only
|
||||
systemSettings: {
|
||||
compiledMagic: 8, // Increase when compiled format changes
|
||||
selfieMagic: 9 // Increase when selfie format changes
|
||||
compiledMagic: 10, // Increase when compiled format changes
|
||||
selfieMagic: 10 // Increase when selfie format changes
|
||||
},
|
||||
|
||||
restoreBackupSettings: {
|
||||
|
File diff suppressed because it is too large
Load Diff
121
src/js/strie.js
121
src/js/strie.js
@ -46,29 +46,29 @@ const STRIE_CHAR1_SLOT = STRIE_TRIE0_SLOT + 3; // 67 / 268
|
||||
const STRIE_TRIE0_START = STRIE_TRIE0_SLOT + 4 << 2; // 272
|
||||
|
||||
|
||||
const STrieContainer = function(details) {
|
||||
if ( details instanceof Object === false ) { details = {}; }
|
||||
const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
|
||||
this.buf = new Uint8Array(Math.max(len, 131072));
|
||||
this.buf32 = new Uint32Array(this.buf.buffer);
|
||||
this.buf32[STRIE_TRIE0_SLOT] = STRIE_TRIE0_START;
|
||||
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
|
||||
this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536;
|
||||
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
|
||||
};
|
||||
const STrieContainer = class {
|
||||
|
||||
STrieContainer.prototype = {
|
||||
constructor(details) {
|
||||
if ( details instanceof Object === false ) { details = {}; }
|
||||
const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
|
||||
this.buf = new Uint8Array(Math.max(len, 131072));
|
||||
this.buf32 = new Uint32Array(this.buf.buffer);
|
||||
this.buf32[STRIE_TRIE0_SLOT] = STRIE_TRIE0_START;
|
||||
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
|
||||
this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536;
|
||||
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// Public methods
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
reset: function() {
|
||||
reset() {
|
||||
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
|
||||
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
|
||||
},
|
||||
}
|
||||
|
||||
matches: function(iroot, a, al) {
|
||||
matches(iroot, a, al) {
|
||||
const ar = a.length;
|
||||
const char0 = this.buf32[STRIE_CHAR0_SLOT];
|
||||
let icell = iroot;
|
||||
@ -102,9 +102,9 @@ STrieContainer.prototype = {
|
||||
if ( icell === 0 || this.buf32[icell+2] === 0 ) { return al; }
|
||||
if ( al === ar ) { return -1; }
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
createOne: function(args) {
|
||||
createOne(args) {
|
||||
if ( Array.isArray(args) ) {
|
||||
return new this.STrieRef(this, args[0], args[1]);
|
||||
}
|
||||
@ -118,13 +118,13 @@ STrieContainer.prototype = {
|
||||
this.buf32[iroot+1] = 0;
|
||||
this.buf32[iroot+2] = 0;
|
||||
return new this.STrieRef(this, iroot, 0);
|
||||
},
|
||||
}
|
||||
|
||||
compileOne: function(trieRef) {
|
||||
compileOne(trieRef) {
|
||||
return [ trieRef.iroot, trieRef.size ];
|
||||
},
|
||||
}
|
||||
|
||||
add: function(iroot, s) {
|
||||
add(iroot, s) {
|
||||
const lschar = s.length;
|
||||
if ( lschar === 0 ) { return 0; }
|
||||
let ischar = 0;
|
||||
@ -221,26 +221,17 @@ STrieContainer.prototype = {
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
optimize: function() {
|
||||
optimize() {
|
||||
this.shrinkBuf();
|
||||
return {
|
||||
byteLength: this.buf.byteLength,
|
||||
char0: this.buf32[STRIE_CHAR0_SLOT],
|
||||
};
|
||||
},
|
||||
}
|
||||
|
||||
fromIterable: function(hostnames, add) {
|
||||
if ( add === undefined ) { add = 'add'; }
|
||||
const trieRef = this.createOne();
|
||||
for ( const hn of hostnames ) {
|
||||
trieRef[add](hn);
|
||||
}
|
||||
return trieRef;
|
||||
},
|
||||
|
||||
serialize: function(encoder) {
|
||||
serialize(encoder) {
|
||||
if ( encoder instanceof Object ) {
|
||||
return encoder.encode(
|
||||
this.buf32.buffer,
|
||||
@ -254,9 +245,9 @@ STrieContainer.prototype = {
|
||||
this.buf32[STRIE_CHAR1_SLOT] + 3 >>> 2
|
||||
)
|
||||
);
|
||||
},
|
||||
}
|
||||
|
||||
unserialize: function(selfie, decoder) {
|
||||
unserialize(selfie, decoder) {
|
||||
const shouldDecode = typeof selfie === 'string';
|
||||
let byteLength = shouldDecode
|
||||
? decoder.decodeSize(selfie)
|
||||
@ -272,23 +263,13 @@ STrieContainer.prototype = {
|
||||
} else {
|
||||
this.buf32.set(selfie);
|
||||
}
|
||||
},
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// Class to hold reference to a specific trie
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
STrieRef: function(container, iroot, size) {
|
||||
this.container = container;
|
||||
this.iroot = iroot;
|
||||
this.size = size;
|
||||
},
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// Private methods
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
addCell: function(idown, iright, v) {
|
||||
addCell(idown, iright, v) {
|
||||
let icell = this.buf32[STRIE_TRIE1_SLOT];
|
||||
this.buf32[STRIE_TRIE1_SLOT] = icell + 12;
|
||||
icell >>>= 2;
|
||||
@ -296,9 +277,9 @@ STrieContainer.prototype = {
|
||||
this.buf32[icell+1] = iright;
|
||||
this.buf32[icell+2] = v;
|
||||
return icell;
|
||||
},
|
||||
}
|
||||
|
||||
addSegment: function(segment) {
|
||||
addSegment(segment) {
|
||||
const lsegchar = segment.length;
|
||||
if ( lsegchar === 0 ) { return 0; }
|
||||
let char1 = this.buf32[STRIE_CHAR1_SLOT];
|
||||
@ -309,9 +290,9 @@ STrieContainer.prototype = {
|
||||
} while ( i !== lsegchar );
|
||||
this.buf32[STRIE_CHAR1_SLOT] = char1;
|
||||
return (lsegchar << 24) | isegchar;
|
||||
},
|
||||
}
|
||||
|
||||
growBuf: function(trieGrow, charGrow) {
|
||||
growBuf(trieGrow, charGrow) {
|
||||
const char0 = Math.max(
|
||||
(this.buf32[STRIE_TRIE1_SLOT] + trieGrow + STRIE_PAGE_SIZE-1) & ~(STRIE_PAGE_SIZE-1),
|
||||
this.buf32[STRIE_CHAR0_SLOT]
|
||||
@ -322,16 +303,16 @@ STrieContainer.prototype = {
|
||||
this.buf.length
|
||||
);
|
||||
this.resizeBuf(bufLen, char0);
|
||||
},
|
||||
}
|
||||
|
||||
shrinkBuf: function() {
|
||||
shrinkBuf() {
|
||||
const char0 = this.buf32[STRIE_TRIE1_SLOT] + 24;
|
||||
const char1 = char0 + this.buf32[STRIE_CHAR1_SLOT] - this.buf32[STRIE_CHAR0_SLOT];
|
||||
const bufLen = char1 + 256;
|
||||
this.resizeBuf(bufLen, char0);
|
||||
},
|
||||
}
|
||||
|
||||
resizeBuf: function(bufLen, char0) {
|
||||
resizeBuf(bufLen, char0) {
|
||||
bufLen = bufLen + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
|
||||
if (
|
||||
bufLen === this.buf.length &&
|
||||
@ -375,23 +356,35 @@ STrieContainer.prototype = {
|
||||
this.buf32[STRIE_CHAR0_SLOT] = char0;
|
||||
this.buf32[STRIE_CHAR1_SLOT] = char0 + charDataLen;
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
/******************************************************************************/
|
||||
/*******************************************************************************
|
||||
|
||||
STrieContainer.prototype.STrieRef.prototype = {
|
||||
add: function(pattern) {
|
||||
Class to hold reference to a specific trie
|
||||
|
||||
*/
|
||||
|
||||
STrieContainer.prototype.STrieRef = class {
|
||||
constructor(container, iroot, size) {
|
||||
this.container = container;
|
||||
this.iroot = iroot;
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
add(pattern) {
|
||||
if ( this.container.add(this.iroot, pattern) === 1 ) {
|
||||
this.size += 1;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
},
|
||||
matches: function(a, al) {
|
||||
}
|
||||
|
||||
matches(a, al) {
|
||||
return this.container.matches(this.iroot, a, al);
|
||||
},
|
||||
[Symbol.iterator]: function() {
|
||||
}
|
||||
|
||||
[Symbol.iterator]() {
|
||||
return {
|
||||
value: undefined,
|
||||
done: false,
|
||||
@ -441,5 +434,5 @@ STrieContainer.prototype.STrieRef.prototype = {
|
||||
forks: [],
|
||||
textDecoder: new TextDecoder()
|
||||
};
|
||||
},
|
||||
}
|
||||
};
|
||||
|
118
src/js/utils.js
118
src/js/utils.js
@ -41,70 +41,101 @@
|
||||
// Benchmark for string-based tokens vs. safe-integer token values:
|
||||
// https://gorhill.github.io/obj-vs-set-vs-map/tokenize-to-str-vs-to-int.html
|
||||
|
||||
µBlock.urlTokenizer = {
|
||||
setURL: function(url) {
|
||||
µBlock.urlTokenizer = new (class {
|
||||
constructor() {
|
||||
this._chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
|
||||
this._validTokenChars = new Uint8Array(128);
|
||||
for ( let i = 0, n = this._chars.length; i < n; i++ ) {
|
||||
this._validTokenChars[this._chars.charCodeAt(i)] = i + 1;
|
||||
}
|
||||
|
||||
this._charsEx = '0123456789%abcdefghijklmnopqrstuvwxyz*.';
|
||||
this._validTokenCharsEx = new Uint8Array(128);
|
||||
for ( let i = 0, n = this._charsEx.length; i < n; i++ ) {
|
||||
this._validTokenCharsEx[this._charsEx.charCodeAt(i)] = i + 1;
|
||||
}
|
||||
|
||||
this.dotTokenHash = this.tokenHashFromString('.');
|
||||
this.anyTokenHash = this.tokenHashFromString('..');
|
||||
this.anyHTTPSTokenHash = this.tokenHashFromString('..https');
|
||||
this.anyHTTPTokenHash = this.tokenHashFromString('..http');
|
||||
this.noTokenHash = this.tokenHashFromString('*');
|
||||
|
||||
this._urlIn = '';
|
||||
this._urlOut = '';
|
||||
this._tokenized = false;
|
||||
this._tokens = [ 0 ];
|
||||
}
|
||||
|
||||
setURL(url) {
|
||||
if ( url !== this._urlIn ) {
|
||||
this._urlIn = url;
|
||||
this._urlOut = url.toLowerCase();
|
||||
this._tokenized = false;
|
||||
}
|
||||
return this._urlOut;
|
||||
},
|
||||
}
|
||||
|
||||
// Tokenize on demand.
|
||||
getTokens: function() {
|
||||
if ( this._tokenized === false ) {
|
||||
this._tokenize();
|
||||
this._tokenized = true;
|
||||
getTokens() {
|
||||
if ( this._tokenized ) { return this._tokens; }
|
||||
let i = this._tokenize();
|
||||
i = this._appendTokenAt(i, this.anyTokenHash, 0);
|
||||
if ( this._urlOut.startsWith('https://') ) {
|
||||
i = this._appendTokenAt(i, this.anyHTTPSTokenHash, 0);
|
||||
} else if ( this._urlOut.startsWith('http://') ) {
|
||||
i = this._appendTokenAt(i, this.anyHTTPTokenHash, 0);
|
||||
}
|
||||
i = this._appendTokenAt(i, this.noTokenHash, 0);
|
||||
this._tokens[i] = 0;
|
||||
this._tokenized = true;
|
||||
return this._tokens;
|
||||
},
|
||||
}
|
||||
|
||||
tokenHashFromString: function(s) {
|
||||
var l = s.length;
|
||||
_appendTokenAt(i, th, ti) {
|
||||
this._tokens[i+0] = th;
|
||||
this._tokens[i+1] = ti;
|
||||
return i + 2;
|
||||
}
|
||||
|
||||
tokenHashFromString(s) {
|
||||
const l = s.length;
|
||||
if ( l === 0 ) { return 0; }
|
||||
if ( l === 1 ) {
|
||||
if ( s === '*' ) { return 63; }
|
||||
if ( s === '.' ) { return 62; }
|
||||
}
|
||||
var vtc = this._validTokenChars,
|
||||
th = vtc[s.charCodeAt(0)];
|
||||
for ( var i = 1; i !== 8 && i !== l; i++ ) {
|
||||
const vtc = this._validTokenCharsEx;
|
||||
let th = vtc[s.charCodeAt(0)];
|
||||
for ( let i = 1; i !== 8 && i !== l; i++ ) {
|
||||
th = th * 64 + vtc[s.charCodeAt(i)];
|
||||
}
|
||||
return th;
|
||||
},
|
||||
}
|
||||
|
||||
stringFromTokenHash: function(th) {
|
||||
stringFromTokenHash(th) {
|
||||
if ( th === 0 ) { return ''; }
|
||||
if ( th === 63 ) { return '*'; }
|
||||
if ( th === 62 ) { return '.'; }
|
||||
const chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
|
||||
let s = '';
|
||||
while ( th > 0 ) {
|
||||
s = `${chars.charAt((th & 0b111111)-1)}${s}`;
|
||||
s = `${this._charsEx.charAt((th & 0b111111)-1)}${s}`;
|
||||
th /= 64;
|
||||
}
|
||||
return s;
|
||||
},
|
||||
}
|
||||
|
||||
// https://github.com/chrisaljoudi/uBlock/issues/1118
|
||||
// We limit to a maximum number of tokens.
|
||||
|
||||
_tokenize: function() {
|
||||
var tokens = this._tokens,
|
||||
url = this._urlOut,
|
||||
l = url.length;
|
||||
if ( l === 0 ) { tokens[0] = 0; return; }
|
||||
_tokenize() {
|
||||
const tokens = this._tokens;
|
||||
let url = this._urlOut;
|
||||
let l = url.length;
|
||||
if ( l === 0 ) { return 0; }
|
||||
if ( l > 2048 ) {
|
||||
url = url.slice(0, 2048);
|
||||
l = 2048;
|
||||
}
|
||||
var i = 0, j = 0, v, n, ti, th,
|
||||
vtc = this._validTokenChars;
|
||||
const vtc = this._validTokenChars;
|
||||
let i = 0, j = 0, v, n, ti, th;
|
||||
for (;;) {
|
||||
for (;;) {
|
||||
if ( i === l ) { tokens[j] = 0; return; }
|
||||
if ( i === l ) { return j; }
|
||||
v = vtc[url.charCodeAt(i++)];
|
||||
if ( v !== 0 ) { break; }
|
||||
}
|
||||
@ -117,25 +148,12 @@
|
||||
th = th * 64 + v;
|
||||
n += 1;
|
||||
}
|
||||
tokens[j++] = th;
|
||||
tokens[j++] = ti;
|
||||
tokens[j+0] = th;
|
||||
tokens[j+1] = ti;
|
||||
j += 2;
|
||||
}
|
||||
},
|
||||
|
||||
_urlIn: '',
|
||||
_urlOut: '',
|
||||
_tokenized: false,
|
||||
_tokens: [ 0 ],
|
||||
_validTokenChars: (function() {
|
||||
var vtc = new Uint8Array(128),
|
||||
chars = '0123456789%abcdefghijklmnopqrstuvwxyz',
|
||||
i = chars.length;
|
||||
while ( i-- ) {
|
||||
vtc[chars.charCodeAt(i)] = i + 1;
|
||||
}
|
||||
return vtc;
|
||||
})()
|
||||
};
|
||||
}
|
||||
})();
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user