From d7d544cda040bb79aa3ab2591521bb1c43f57aaa Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Sat, 3 Nov 2018 08:58:46 -0300 Subject: [PATCH] Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries --- src/js/background.js | 10 +- src/js/hntrie.js | 974 +- src/js/start.js | 9 +- src/js/static-net-filtering.js | 238 +- src/js/storage.js | 45 +- src/js/wasm/README.md | 24 + src/js/wasm/hntrie.wasm | Bin 0 -> 337 bytes src/js/wasm/hntrie.wat | 200 + test/hnset-benchmark.html | 479 + test/hntrie-test.html | 45866 +++++++++++++++++++++++++++++++ 10 files changed, 47177 insertions(+), 668 deletions(-) create mode 
100644 src/js/wasm/README.md create mode 100644 src/js/wasm/hntrie.wasm create mode 100644 src/js/wasm/hntrie.wat create mode 100644 test/hnset-benchmark.html create mode 100644 test/hntrie-test.html diff --git a/src/js/background.js b/src/js/background.js index 671cee95d..75b4b5a50 100644 --- a/src/js/background.js +++ b/src/js/background.js @@ -33,12 +33,12 @@ if ( vAPI.webextFlavor === undefined ) { /******************************************************************************/ -var µBlock = (function() { // jshint ignore:line +const µBlock = (function() { // jshint ignore:line - var oneSecond = 1000, - oneMinute = 60 * oneSecond; + const oneSecond = 1000, + oneMinute = 60 * oneSecond; - var hiddenSettingsDefault = { + const hiddenSettingsDefault = { assetFetchTimeout: 30, autoUpdateAssetFetchPeriod: 120, autoUpdatePeriod: 7, @@ -56,7 +56,7 @@ var µBlock = (function() { // jshint ignore:line userResourcesLocation: 'unset' }; - var whitelistDefault = [ + const whitelistDefault = [ 'about-scheme', 'chrome-extension-scheme', 'chrome-scheme', diff --git a/src/js/hntrie.js b/src/js/hntrie.js index 96ce32aea..86b0eb009 100644 --- a/src/js/hntrie.js +++ b/src/js/hntrie.js @@ -1,7 +1,7 @@ /******************************************************************************* uBlock Origin - a browser extension to block requests. - Copyright (C) 2017 Raymond Hill + Copyright (C) 2017-present Raymond Hill This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,6 +19,9 @@ Home: https://github.com/gorhill/uBlock */ +/* globals WebAssembly */ +/* exported hnTrieManager */ + 'use strict'; /******************************************************************************* @@ -37,561 +40,492 @@ For example, `www.abc.com` is deemed matching `abc.com`, because the former is a subdomain of the latter. The opposite is of course not true. 
- The resulting read-only trie created as a result of using HNTrieBuilder are + The resulting read-only tries created as a result of using hnTrieManager are simply just typed arrays filled with integers. The matching algorithm is just a matter of reading/comparing these integers, and further using them as indices in the array as a way to move around in the trie. - There is still place for optimizations. Specifically, I could force the - strings to be properly sorted so that `HNTrie.matches` could bail earlier - when trying to find a matching descendant -- but suspect the gain would be - marginal, if measurable. - [1] To solve */ -var HNTrieBuilder = function() { - this.reset(); -}; +const hnTrieManager = { + tree: null, + treesz: 0, + trie: new Uint8Array(65536), + trie32: null, + triesz: 256, // bytes 0-254: decoded needle, byte 255: needle length + id: 0, + needle: '', + wasmLoading: null, + wasmMemory: null, + cleanupToken: 0, + cleanupTimer: undefined, -/******************************************************************************* - - A plain javascript array is used to build the trie. It will be casted into - the appropriate read-only TypedArray[1] at vacuum time. - - [1] Depending on the size: Uint8Array, Uint16Array, or Uint32Array. - -*/ - -HNTrieBuilder.prototype.reset = function() { - this.buf = []; - this.bufsz = 0; - this.buf[0] = 0; - this.buf[1] = 0; - this.buf[2] = 0; - return this; -}; - -/******************************************************************************* - - Helpers for convenience. 
- -*/ - -HNTrieBuilder.fromDomainOpt = function(domainOpt) { - var builder = new HNTrieBuilder(); - builder.fromDomainOpt(domainOpt); - return builder.vacuum(); -}; - -HNTrieBuilder.fromIterable = function(hostnames) { - var builder = new HNTrieBuilder(); - builder.fromIterable(hostnames); - return builder.vacuum(); -}; - -HNTrieBuilder.print = function(trie) { - var buf = trie.buf, - i = 0, cc = [], ic, indent = 0, - forks = []; - for (;;) { - if ( buf[i] !== 0 ) { - forks.push(i, indent); + reset: function() { + if ( this.wasmMemory === null && this.trie.byteLength > 65536 ) { + this.trie = new Uint8Array(65536); + this.trie32 = new Uint32Array(this.trie.buffer); + } else { + this.trie.fill(0); } - cc.unshift(buf[i+2]); - for ( ic = 0; ic < buf[i+3]; ic++ ) { - cc.unshift(buf[i+4+ic]); + this.triesz = 256; + this.needle = ''; + this.id += 1; + }, + + readyToUse: function() { + return this.wasmLoading instanceof Promise + ? this.wasmLoading + : Promise.resolve(); + }, + + isValidRef: function(ref) { + return ref !== null && ref.id === this.id; + }, + + setNeedle: function(needle) { + if ( needle !== this.needle ) { + const buf = this.trie; + let i = needle.length; + buf[255] = i; + while ( i-- ) { + buf[i] = needle.charCodeAt(i); + } + this.needle = needle; } - console.log('\xB7'.repeat(indent) + String.fromCharCode.apply(null, cc)); - indent += cc.length; - cc = []; - i = buf[i+1]; - if ( i === 0 ) { - if ( forks.length === 0 ) { break; } - indent = forks.pop(); - i = forks.pop(); - i = buf[i]; + return this; + }, + + matchesJS: function(itrie) { + const buf = this.trie; + const buf32 = this.trie32; + let ineedle = buf[255]; + for (;;) { + ineedle -= 1; + const nchar = ineedle === -1 ? 
0 : buf[ineedle]; + for (;;) { + const tchar = buf[itrie+8]; // quick test: first character + if ( tchar === nchar ) { break; } + if ( tchar === 0 && nchar === 0x2E ) { return 1; } + itrie = buf32[itrie >>> 2]; + if ( itrie === 0 ) { return 0; } // no more descendants + } + if ( nchar === 0 ) { return 1; } + let lxtra = buf[itrie+9]; // length of extra characters + if ( lxtra !== 0 ) { // cell has extra characters + if ( lxtra > ineedle ) { return 0; } + let ixtra = itrie + 10; + lxtra += ixtra; + do { + ineedle -= 1; + if ( buf[ineedle] !== buf[ixtra] ) { return 0; } + ixtra += 1; + } while ( ixtra !== lxtra ); + } + itrie = buf32[itrie + 4 >>> 2]; + if ( itrie === 0 ) { + return ineedle === 0 || buf[ineedle-1] === 0x2E ? 1 : 0; + } } - } -}; + }, + matchesWASM: null, + matches: null, -/******************************************************************************* + start: function() { + if ( this.trie32 === null ) { + this.trie32 = new Uint32Array(this.trie.buffer); + } + this.treesz = 0; + if ( this.tree === null ) { + this.tree = new Uint32Array(16384); + } + this.tree[0] = 0; + this.tree[1] = 0; + this.tree[2] = 0; + }, - Since this trie is specialized for matching hostnames, the stored strings are - reversed internally, because of hostname comparison logic: + /*************************************************************************** - Correct matching: - index 0123456 + Since this trie is specialized for matching hostnames, the stored + strings are reversed internally, because of hostname comparison logic: + + Correct matching: + index 0123456 + abc.com + | + www.abc.com + index 01234567890 + + Incorrect matching (typically used for plain strings): + index 0123456 abc.com - | - www.abc.com - index 01234567890 + | + www.abc.com + index 01234567890 - Incorrect matching (typically used for plain strings): - index 0123456 - abc.com - | - www.abc.com - index 01234567890 + */ -*/ - -HNTrieBuilder.prototype.add = function(hn) { - var ichar = hn.length - 1; - if ( 
ichar === -1 ) { return; } - var c = hn.charCodeAt(ichar), - i = 0, inext; - for (;;) { - if ( this.buf[i+2] !== c ) { // match not found - inext = this.buf[i]; // move to descendant - if ( inext === 0 ) { break; } // no descendant - } else { // match found - if ( c === 0 ) { return; } - inext = this.buf[i+1]; // move to sibling - ichar -= 1; - c = ichar === -1 ? 0 : hn.charCodeAt(ichar); + add: function(hn) { + // 256 * 3 + 3 = 771 + if ( this.treesz + 771 >= this.tree.length ) { + this.growTree(); } - i = inext; - } - // Any new string added will always cause a new descendant to be created. - // The only time this is not the case is when trying to store a string - // which is already in the trie. - inext = this.bufsz; // new descendant cell - this.buf[i] = inext; - this.buf[inext+0] = 0; // jump index to descendant - this.buf[inext+1] = 0; // jump index to sibling - this.buf[inext+2] = c; // character code - this.bufsz += 3; - if ( c === 0 ) { return; } // character zero is always last cell - do { - i = inext; // new branch sprouting made from - ichar -= 1; // all characters left to store - c = ichar === -1 ? 0 : hn.charCodeAt(ichar); - inext = this.bufsz; - this.buf[i+1] = inext; - this.buf[inext+0] = 0; - this.buf[inext+1] = 0; - this.buf[inext+2] = c; - this.bufsz += 3; - } while ( c!== 0 ); -}; + let ichar = hn.length - 1; + if ( ichar === -1 ) { return; } + let c = hn.charCodeAt(ichar), + i = 0, inext; + for (;;) { + if ( this.tree[i+2] !== c ) { // match not found + inext = this.tree[i]; // move to descendant + if ( inext === 0 ) { break; } // no descendant + } else { // match found + if ( c === 0 ) { return; } + inext = this.tree[i+1]; // move to sibling + ichar -= 1; + c = ichar === -1 ? 0 : hn.charCodeAt(ichar); + } + i = inext; + } + // Any new string added will always cause a new descendant to be + // created. The only time this is not the case is when trying to + // store a string which is already in the trie. 
+ inext = this.treesz; // new descendant cell + this.tree[i] = inext; + this.tree[inext+0] = 0; // jump index to descendant + this.tree[inext+1] = 0; // jump index to sibling + this.tree[inext+2] = c; // character code + this.treesz += 3; + if ( c === 0 ) { return; } // character zero is always last cell + do { + i = inext; // new branch sprouting made from + ichar -= 1; // all characters left to store + c = ichar === -1 ? 0 : hn.charCodeAt(ichar); + inext = this.treesz; + this.tree[i+1] = inext; + this.tree[inext+0] = 0; + this.tree[inext+1] = 0; + this.tree[inext+2] = c; + this.treesz += 3; + } while ( c!== 0 ); + }, -/******************************************************************************* + growTree: function() { + let tree = new Uint32Array(this.tree.length + 16384); + tree.set(this.tree); + this.tree = tree; + }, - Not using String.split('|') to avoid memory churning. + /*************************************************************************** -*/ + Before vacuuming, each cell is 3 entry-long: + - Jump index to descendant (if any) + - Jump index to sibling (if any) + - character code -HNTrieBuilder.prototype.fromDomainOpt = function(hostnames) { - return this.fromIterable(hostnames.split('|')); -}; + All strings stored in the un-vacuumed trie are zero-terminated, and the + character zero does occupy a cell like any other character. Let's + use _ to represent character zero for sake of comments. The asterisk + will be used to highlight a node with a descendant. -HNTrieBuilder.prototype.fromIterable = function(hostnames) { - var hns = Array.from(hostnames).sort(function(a, b) { - return a.length - b.length; - }); - // https://github.com/gorhill/uBlock/issues/3328 - // Must sort from shortest to longest. - for ( var hn of hns ) { - this.add(hn); - } - return this; + Cases, before vacuuming: + + abc.com, abc.org: 16 cells + * + _ -- a -- b -- c -- . -- c -- o -- m + _ -- a -- b -- c -- . 
-- o -- r -- g + + abc.com, xyz.com: 12 cells + * + _ -- a -- b -- c -- . -- c -- o -- m + _ -- x -- y -- z + + ab.com, b.com: 8 cells + * + _ -- a -- b -- . -- c -- o -- m + _ + + b.com, ab.com: 8 cells + * + _ -- b -- . -- c -- o -- m + _ -- a + + Vacuuming is the process of merging sibling cells with no descendants. + Cells with descendants can't be merged. + + Each time we arrive at the end of a horizontal branch (sibling jump + index is 0), we walk back to the nearest previous node with descendants, + and repeat the process. Since there is no index information on where to + come back, a stack is used to remember cells with descendants (descendant + jump index is non zero) encountered on the way. + + After vacuuming, each cell is 4+n entry-long: + - Jump index to descendant (if any) + - Jump index to sibling (if any) + - character code + - length of merged character code(s) + + Cases, after vacuuming: + + abc.com, abc.org: 2 cells + * + [abc.co]m + [abc.or]g + + abc.com, xyz.com: 3 cells + * + [ab]c -- [.co]m + [xy]z + + ab.com, b.com: 3 cells + * + a -- [b.co]m + _ + + b.com, ab.com: 3 cells + * + _ -- [b.co]m + a + + It's possible for a character zero cell to have descendants. + + It's not possible for a character zero cell to have next siblings. + + This will have to be taken into account during both vacuuming and + matching. + + Character zero cells with no descendant are discarded during vacuuming. + Character zero cells with a descendant, or character zero cells which + are a descendant, are kept in the vacuumed trie. + + A vacuumed trie is very efficient memory- and lookup-wise, but is also + read-only: no string can be added or removed. The read-only trie is + really just a self-sufficient array of integers, and can easily be + exported/imported as a JSON array. 
It is theoretically possible to + "decompile" a trie (vacuumed or not) into the set of strings originally + added to it (in the order they were added with the current + implementation), but so far I do not need this feature. + + New vacuum output array format: + byte 0..3: offset to descendant (uint32) + byte 4..7: offset to sibling (uint32) + byte 8: first character + byte 9: number of extra characters + Offsets are native-endian uint32 (little-endian is required for wasm). + + 4 + 4 + 1 + 1 = 10 bytes for one character, otherwise + 4 + 4 + 1 + 1 + n = 10 + n bytes for one + n character(s), padded to a multiple of 4 bytes + */ + + finish: function() { + if ( this.treesz === 0 ) { return null; } + const input = this.tree, + iout0 = this.triesz, + forks = []; + let output = this.trie, + output32 = this.trie32, + iout1 = iout0, + iout2 = output.byteLength, + iin = 0; + for (;;) { + if ( (iout1 + 266) >= iout2 ) { + this.growTrie(); + output = this.trie; + output32 = this.trie32; + iout2 = output.byteLength; + } + let iout = iout1; + output32[iout >>> 2] = 0; + output32[iout + 4 >>> 2] = 0; + output[iout+8] = input[iin+2]; // first character + output[iout+9] = 0; // extra character count + iout1 += 10; + if ( input[iin] !== 0 ) { // cell with descendant + forks.push(iout, iin); // defer processing + } + for (;;) { // merge sibling cell(s) + iin = input[iin+1]; // sibling cell + if ( iin === 0 ) { break; } // no more sibling cell + if ( input[iin] !== 0 ) { break; } // cell with a descendant + if ( input[iin+2] === 0 ) { break; } // don't merge \x00 + output[iout1] = input[iin+2]; // add character data + iout1 += 1; + } + if ( iout1 !== iout + 10 ) { // cells were merged + output[iout+9] = iout1 - iout - 10; // so adjust count + } + iout1 = (iout1 + 3) & ~3; // align to i32 + if ( iin !== 0 && input[iin] !== 0 ) { // can't merge this cell + output32[iout + 4 >>> 2] = iout1; + continue; + } + if ( forks.length === 0 ) { break; } // no more descendants: bye + iin = forks.pop(); // process next descendant + iout = forks.pop(); + iin = input[iin]; + 
output32[iout >>> 2] = iout1; + } + this.triesz = iout1; + this.cleanupAsync(); + return new HNTrieRef(iout0); + }, + + fromIterable: function(hostnames) { + this.start(); + const hns = Array.from(hostnames).sort(function(a, b) { + return a.length - b.length; + }); + // https://github.com/gorhill/uBlock/issues/3328 + // Must sort from shortest to longest. + for ( let hn of hns ) { + this.add(hn); + } + return this.finish(); + }, + + fromDomainOpt: function(hostnames) { + return this.fromIterable(hostnames.split('|')); + }, + + growTrie: function() { + let trie; + if ( this.wasmMemory === null ) { + trie = new Uint8Array(this.trie.byteLength + 65536); + trie.set(this.trie); + } else { + this.wasmMemory.grow(1); + trie = new Uint8Array(this.wasmMemory.buffer); + } + this.trie = trie; + this.trie32 = new Uint32Array(this.trie.buffer); + }, + + cleanupAsync: function() { + if ( this.cleanupTimer === undefined ) { + this.cleanupToken = this.triesz; + this.cleanupTimer = setTimeout(( ) => { + this.cleanupTimer = undefined; + if ( this.cleanupToken !== this.triesz ) { + this.cleanupAsync(); + } else { + this.tree = null; + } + }, 30000); + } + }, + + // For debugging purpose + // TODO: currently broken, needs to be fixed as per new buffer format. 
+ /* + print: function(offset) { + let i = offset, cc = [], indent = 0, + forks = []; + for (;;) { + if ( buf[i] !== 0 ) { + forks.push(i, indent); + } + cc.unshift(buf[i+2]); + for ( let ic = 0; ic < buf[i+3]; ic++ ) { + cc.unshift(buf[i+4+ic]); + } + console.log('\xB7'.repeat(indent) + String.fromCharCode.apply(null, cc)); + indent += cc.length; + cc = []; + i = buf[i+1]; + if ( i === 0 ) { + if ( forks.length === 0 ) { break; } + indent = forks.pop(); + i = forks.pop(); + i = buf[i]; + } + } + }, + */ }; /******************************************************************************/ -HNTrieBuilder.prototype.matches = function(needle) { - var ichar = needle.length - 1, - buf = this.buf, i = 0, c; - for (;;) { - c = ichar === -1 ? 0 : needle.charCodeAt(ichar); - while ( buf[i+2] !== c ) { - i = buf[i]; - if ( i === 0 ) { return false; } - } - if ( c === 0 ) { return true; } - i = buf[i+1]; - if ( i === 0 ) { return c === 0x2E; } - ichar -= 1; +(function() { + // Default to javascript version. + hnTrieManager.matches = hnTrieManager.matchesJS; + + if ( + typeof WebAssembly !== 'object' || + typeof WebAssembly.instantiateStreaming !== 'function' + ) { + return; } -}; -/******************************************************************************* - - Before vacuuming, each cell is 3 entry-long: - - Jump index to descendant (if any) - - Jump index to sibling (if any) - - character code - - All strings stored in the un-vacuumed trie are zero-terminated, and the - character zero does occupy a cell like any other character. Let's use _ to - represent character zero for sake of comments. The asterisk will be used to - highlight a node with a descendant. - - Cases, before vacuuming: - - abc.com, abc.org: 16 cells - * - _ -- a -- b -- c -- . -- c -- o -- m - _ -- a -- b -- c -- . -- o -- r -- g - - abc.com, xyz.com: 12 cells - * - _ -- a -- b -- c -- . -- c -- o -- m - _ -- x -- y -- z - - ab.com, b.com: 8 cells - * - _ -- a -- b -- . 
-- c -- o -- m - _ - - b.com, ab.com: 8 cells - * - _ -- b -- . -- c -- o -- m - _ -- a - - Vacuuming is the process of merging sibling cells with no descendants. Cells - with descendants can't be merged. - - Each time we arrive at the end of a horizontal branch (sibling jump index is - 0), we walk back to the nearest previous node with descendants, and repeat - the process. Since there is no index information on where to come back, a - stack is used to remember cells with descendants (descendant jump index is - non zero) encountered on the way - - After vacuuming, each cell is 4+n entry-long: - - Jump index to descendant (if any) - - Jump index to sibling (if any) - - character code - - length of merged character code(s) - - Cases, after vacuuming: - - abc.com, abc.org: 2 cells - * - [abc.co]m - [abc.or]g - - abc.com, xyz.com: 3 cells - * - [ab]c -- [.co]m - [xy]z - - ab.com, b.com: 3 cells - * - a -- [b.co]m - _ - - b.com, ab.com: 3 cells - * - _ -- [b.co]m - a - - It's possible for a character zero cell to have descendants. - - It's not possible for a character zero cell to have next siblings. - - This will have to be taken into account during both vacuuming and matching. - - Character zero cells with no descendant are discarded during vacuuming. - Character zero cells with a descendant, or character zero cells which are a - decendant are kept into the vacuumed trie. - - A vacuumed trie is very efficient memory- and lookup-wise, but is also - read-only: no string can be added or removed. The read-only trie is really - just a self-sufficient array of integers, and can easily be exported/imported - as a JSON array. It is theoretically possible to "decompile" a trie (vacuumed - or not) into the set of strings originally added to it (in the order they - were added with the current implementation), but so far I do not need this - feature. - - TODO: It's possible to build the vacuumed trie on the fly as items are - added to it. 
I need to carefully list all possible cases which can arise - at insertion time. The benefits will be: faster creation time (expected), no - longer read-only trie (items can be added at any time). - -*/ - -HNTrieBuilder.prototype.vacuum = function() { - if ( this.bufsz === 0 ) { return null; } - var input = this.buf, - output = [], outsz = 0, - forks = [], - iin = 0, iout; - for (;;) { - iout = outsz; - output[iout+0] = 0; - output[iout+1] = 0; - output[iout+2] = input[iin+2]; // first character - output[iout+3] = 0; - outsz += 4; - if ( input[iin] !== 0 ) { // cell with descendant - forks.push(iout, iin); // defer processing - } - for (;;) { // merge sibling cell(s) - iin = input[iin+1]; // sibling cell - if ( iin === 0 ) { break; } // no more sibling cell - if ( input[iin] !== 0 ) { break; } // cell with a descendant - if ( input[iin+2] === 0 ) { break; } // don't merge \x00 - output[outsz] = input[iin+2]; // add character data - outsz += 1; - } - if ( outsz !== iout + 4 ) { // cells were merged - output[iout+3] = outsz - iout - 4; // so adjust count - } - if ( iin !== 0 && input[iin] !== 0 ) { // can't merge this cell - output[iout+1] = outsz; - continue; - } - if ( forks.length === 0 ) { break; } // no more descendants: bye - iin = forks.pop(); // process next descendant - iout = forks.pop(); - iin = input[iin]; - output[iout] = outsz; + // Soft-dependency on vAPI so that the code here can be used outside of + // uBO (i.e. tests, benchmarks) + if ( + typeof vAPI === 'object' && + vAPI.webextFlavor.soup.has('firefox') === false + ) { + return; } - var trie; // pick optimal read-only - if ( outsz < 256 ) { // container array. 
- trie = new this.HNTrie8(output, outsz); - } else if ( outsz < 65536 ) { - trie = new this.HNTrie16(output, outsz); - } else { - trie = new this.HNTrie32(output, outsz); - } - this.reset(); // free working array - return trie; -}; -/******************************************************************************* + // The wasm module will work only if CPU is natively little-endian, + // as we use native uint32 array in our trie-creation js code. + const uint32s = new Uint32Array(1); + const uint8s = new Uint8Array(uint32s.buffer); + uint32s[0] = 1; + if ( uint8s[0] !== 1 ) { return; } - The following internal classes are the actual output of the vacuum() method. - - They use the minimal amount of data to be able to efficiently lookup strings - in a read-only trie. - - Given that javascript optimizers mind that the type of an argument passed to - a function always stays the same each time the function is called, there need - to be three separate implementation of matches() to allow the javascript - optimizer to do its job. - - The matching code deals only with looking up values in a TypedArray (beside - calls to String.charCodeAt), so I expect this to be fast and good candidate - for optimization by javascript engines. - -*/ - -HNTrieBuilder.prototype.HNTrie8 = function(buf, bufsz) { - this.buf = new Uint8Array(buf.slice(0, bufsz)); -}; - -HNTrieBuilder.prototype.HNTrie8.prototype.matches = function(needle) { - var ichar = needle.length, - i = 0, c1, c2, ccnt, ic, i1, i2; - for (;;) { - ichar -= 1; - c1 = ichar === -1 ? 
0 : needle.charCodeAt(ichar); - while ( (c2 = this.buf[i+2]) !== c1 ) { // quick test: first character - if ( c2 === 0 && c1 === 0x2E ) { return true; } - i = this.buf[i]; // next descendant - if ( i === 0 ) { return false; } // no more descendants - } - if ( c1 === 0 ) { return true; } - ccnt = this.buf[i+3]; - if ( ccnt !== 0 ) { // cell is only one character - if ( ccnt > ichar ) { return false; } - ic = ccnt; i1 = ichar-1; i2 = i+4; - while ( ic-- && needle.charCodeAt(i1-ic) === this.buf[i2+ic] ); - if ( ic !== -1 ) { return false; } - ichar -= ccnt; - } - i = this.buf[i+1]; // next sibling - if ( i === 0 ) { - return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E; - } - } -}; - -HNTrieBuilder.prototype.HNTrie16 = function(buf, bufsz) { - this.buf = new Uint16Array(buf.slice(0, bufsz)); -}; - -HNTrieBuilder.prototype.HNTrie16.prototype.matches = function(needle) { - var ichar = needle.length, - i = 0, c1, c2, ccnt, ic, i1, i2; - for (;;) { - ichar -= 1; - c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar); - while ( (c2 = this.buf[i+2]) !== c1 ) { // quick test: first character - if ( c2 === 0 && c1 === 0x2E ) { return true; } - i = this.buf[i]; // next descendant - if ( i === 0 ) { return false; } // no more descendants - } - if ( c1 === 0 ) { return true; } - ccnt = this.buf[i+3]; - if ( ccnt !== 0 ) { // cell is only one character - if ( ccnt > ichar ) { return false; } - ic = ccnt; i1 = ichar-1; i2 = i+4; - while ( ic-- && needle.charCodeAt(i1-ic) === this.buf[i2+ic] ); - if ( ic !== -1 ) { return false; } - ichar -= ccnt; - } - i = this.buf[i+1]; // next sibling - if ( i === 0 ) { - return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E; - } - } -}; - -HNTrieBuilder.prototype.HNTrie32 = function(buf, bufsz) { - this.buf = new Uint32Array(buf.slice(0, bufsz)); -}; - -HNTrieBuilder.prototype.HNTrie32.prototype.matches = function(needle) { - var ichar = needle.length, - i = 0, c1, c2, ccnt, ic, i1, i2; - for (;;) { - ichar -= 1; - c1 = ichar === -1 ? 
0 : needle.charCodeAt(ichar); - while ( (c2 = this.buf[i+2]) !== c1 ) { // quick test: first character - if ( c2 === 0 && c1 === 0x2E ) { return true; } - i = this.buf[i]; // next descendant - if ( i === 0 ) { return false; } // no more descendants - } - if ( c1 === 0 ) { return true; } - ccnt = this.buf[i+3]; - if ( ccnt !== 0 ) { // cell is only one character - if ( ccnt > ichar ) { return false; } - ic = ccnt; i1 = ichar-1; i2 = i+4; - while ( ic-- && needle.charCodeAt(i1-ic) === this.buf[i2+ic] ); - if ( ic !== -1 ) { return false; } - ichar -= ccnt; - } - i = this.buf[i+1]; // next sibling - if ( i === 0 ) { - return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E; - } - } -}; - -/******************************************************************************* - - Experimenting: WebAssembly version. - Developed using this simple online tool: https://wasdk.github.io/WasmFiddle/ - - >>> start of C code - unsigned short buffer[0]; - int matches(int id, int cclen) + let workingDir; { - unsigned short* cc0 = &buffer[0]; - unsigned short* cc = cc0 + cclen; - unsigned short* cell0 = &buffer[512+id]; - unsigned short* cell = cell0; - unsigned short* ww; - int c1, c2, ccnt; - for (;;) { - c1 = cc <= cc0 ? 0 : *--cc; - for (;;) { - c2 = cell[2]; - if ( c2 == c1 ) { break; } - if ( c2 == 0 && c1 == 0x2E ) { return 1; } - if ( cell[0] == 0 ) { return 0; } - cell = cell0 + cell[0]; - } - if ( c1 == 0 ) { return 1; } - ccnt = cell[3]; - if ( ccnt != 0 ) { - if ( cc - ccnt < cc0 ) { return 0; } - ww = cell + 4; - while ( ccnt-- ) { - if ( *--cc != *ww++ ) { return 0; } - } - } - if ( cell[1] == 0 ) { - if ( cc == cc0 ) { return 1; } - if ( *--cc == 0x2E ) { return 1; } - return 0; - } - cell = cell0 + cell[1]; - } + const url = document.currentScript.src; + const match = /[^\/]+$/.exec(url); + workingDir = match !== null + ? 
url.slice(0, match.index) + : ''; } - int getLinearMemoryOffset() { - return (int)&buffer[0]; - } - <<< end of C code - Observations: - - When growing memory, we must re-create the typed array js-side. The content - of the array is preserved by grow(). - - It's slower than the javascript version... Possible explanations: - - Call overhead: https://github.com/WebAssembly/design/issues/1120 - - Having to copy whole input string in buffer before call. + const memory = new WebAssembly.Memory({ initial: 1 }); -var HNTrie16wasm = (function() { - var module; - var instance; - var memory; - var memoryOrigin = 0; - var memoryUsed = 1024; - var cbuffer; - var tbuffer; - var tbufferSize = 0; - var matchesFn; - - var init = function() { - module = new WebAssembly.Module(new Uint8Array([0,97,115,109,1,0,0,0,1,139,128,128,128,0,2,96,2,127,127,1,127,96,0,1,127,3,131,128,128,128,0,2,0,1,4,132,128,128,128,0,1,112,0,0,5,131,128,128,128,0,1,0,1,6,129,128,128,128,0,0,7,172,128,128,128,0,3,6,109,101,109,111,114,121,2,0,7,109,97,116,99,104,101,115,0,0,21,103,101,116,76,105,110,101,97,114,77,101,109,111,114,121,79,102,102,115,101,116,0,1,10,217,130,128,128,0,2,202,130,128,128,0,1,5,127,32,1,65,1,116,65,12,106,33,3,32,0,65,1,116,65,140,8,106,34,2,33,0,2,64,2,64,2,64,2,64,2,64,2,64,3,64,65,0,33,5,2,64,32,3,65,12,77,13,0,32,3,65,126,106,34,3,47,1,0,33,5,11,2,64,32,5,32,0,47,1,4,34,1,70,13,0,2,64,32,5,65,46,71,13,0,3,64,32,1,65,255,255,3,113,69,13,5,32,0,47,1,0,34,1,69,13,6,32,2,32,1,65,1,116,106,34,0,47,1,4,34,1,65,46,71,13,0,12,2,11,11,3,64,32,0,47,1,0,34,1,69,13,3,32,5,32,2,32,1,65,1,116,106,34,0,47,1,4,71,13,0,11,11,65,1,33,6,32,5,69,13,5,2,64,2,64,32,0,47,1,6,34,1,69,13,0,32,3,32,1,65,1,116,107,65,12,73,13,8,32,1,65,127,115,33,5,32,0,65,8,106,33,1,3,64,32,5,65,1,106,34,5,69,13,1,32,1,47,1,0,33,4,32,1,65,2,106,33,1,32,4,32,3,65,126,106,34,3,47,1,0,70,13,0,12,2,11,11,32,0,47,1,2,34,1,69,13,5,32,2,32,1,65,1,116,106,33,0,12,1,11,11,65,0,15,11,65,0,15,11,65,1,15,11,65,0,15,11,32,3,65,12,70,13
,0,32,3,65,126,106,47,1,0,65,46,70,33,6,11,32,6,15,11,65,0,11,132,128,128,128,0,0,65,12,11])); - instance = new WebAssembly.Instance(module); - memory = instance.exports.memory; - memoryOrigin = instance.exports.getLinearMemoryOffset(); - cbuffer = new Uint16Array(memory.buffer, memoryOrigin, 512); - tbuffer = new Uint16Array(memory.buffer, memoryOrigin + 1024); - memoryUsed = memoryOrigin + 1024; - matchesFn = instance.exports.matches; - }; - - return { - create: function(data) { - if ( module === undefined ) { init(); } - var bytesNeeded = memoryUsed + ((data.length * 2 + 3) & ~3); - if ( bytesNeeded > memory.buffer.byteLength ) { - memory.grow((bytesNeeded - memory.buffer.byteLength + 65535) >>> 16); - cbuffer = new Uint16Array(memory.buffer, memoryOrigin, 512); - tbuffer = new Uint16Array(memory.buffer, memoryOrigin + 1024); - } - for ( var i = 0, j = tbufferSize; i < data.length; i++, j++ ) { - tbuffer[j] = data[i]; - } - var id = tbufferSize; - tbufferSize += data.length; - if ( tbufferSize & 1 ) { tbufferSize += 1; } - memoryUsed += tbufferSize * 2; - return id; - }, - reset: function() { - module = undefined; - instance = undefined; - memory = undefined; - memory.grow(1); - memoryUsed = 1024; - cbuffer = undefined; - tbuffer = undefined; - tbufferSize = 0; - }, - matches: function(id, hn) { - var len = hn.length; - if ( len > 512 ) { - hn = hn.slice(-512); - var pos = hn.indexOf('.'); - if ( pos !== 0 ) { - hn = hn.slice(pos + 1); - } - len = hn.length; - } - var needle = cbuffer, i = len; - while ( i-- ) { - needle[i] = hn.charCodeAt(i); - } - return matchesFn(id, len) === 1; + hnTrieManager.wasmLoading = WebAssembly.instantiateStreaming( + fetch(workingDir + 'wasm/hntrie.wasm', { mode: 'same-origin' }), + { imports: { memory } } + ).then(result => { + hnTrieManager.wasmLoading = null; + if ( !result || !result.instance ) { return; } + const pageCount = hnTrieManager.trie.byteLength >>> 16; + if ( pageCount > 1 ) { + memory.grow(pageCount - 1); } - }; + 
const trie = new Uint8Array(memory.buffer); + trie.set(hnTrieManager.trie); + hnTrieManager.trie = trie; + if ( hnTrieManager.trie32 !== null ) { + hnTrieManager.trie32 = new Uint32Array(memory.buffer); + } + hnTrieManager.wasmMemory = memory; + hnTrieManager.matchesWASM = result.instance.exports.matches; + hnTrieManager.matches = hnTrieManager.matchesWASM; + }).catch(reason => { + hnTrieManager.wasmLoading = null; + console.error(reason); + }); })(); -*/ + +/******************************************************************************/ + +const HNTrieRef = function(offset) { + this.id = hnTrieManager.id; + this.offset = offset; +}; + +HNTrieRef.prototype = { + isValid: function() { + return this.id === hnTrieManager.id; + }, + matches: function(needle) { + return hnTrieManager.setNeedle(needle).matches(this.offset); + }, + matchesJS: function(needle) { + return hnTrieManager.setNeedle(needle).matchesJS(this.offset); + }, + matchesWASM: function(needle) { + return hnTrieManager.setNeedle(needle).matchesWASM(this.offset); + }, +}; diff --git a/src/js/start.js b/src/js/start.js index bad7435c2..28f6cf2e4 100644 --- a/src/js/start.js +++ b/src/js/start.js @@ -29,7 +29,7 @@ /******************************************************************************/ -var µb = µBlock; +const µb = µBlock; /******************************************************************************/ @@ -287,7 +287,12 @@ var onFirstFetchReady = function(fetched) { onVersionReady(fetched.version); onCommandShortcutsReady(fetched.commandShortcuts); - µb.loadPublicSuffixList(onPSLReady); + Promise.all([ + µb.loadPublicSuffixList(), + µb.staticNetFilteringEngine.readyToUse() + ]).then(( ) => { + onPSLReady(); + }); µb.loadRedirectResources(); }; diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index cdf54d2d4..8f2ae69b5 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -20,7 +20,7 @@ */ /* jshint bitwise: false */ -/* global punycode, 
HNTrieBuilder */ +/* global punycode, hnTrieManager */ 'use strict'; @@ -30,7 +30,7 @@ /******************************************************************************/ -var µb = µBlock; +const µb = µBlock; // fedcba9876543210 // | | ||| @@ -43,15 +43,15 @@ var µb = µBlock; // | +-------- bit 4- 8: type [0 - 31] // +------------- bit 9-15: unused -var BlockAction = 0 << 0; -var AllowAction = 1 << 0; -var Important = 1 << 1; -var AnyParty = 0 << 2; -var FirstParty = 1 << 2; -var ThirdParty = 2 << 2; +const BlockAction = 0 << 0; +const AllowAction = 1 << 0; +const Important = 1 << 1; +const AnyParty = 0 << 2; +const FirstParty = 1 << 2; +const ThirdParty = 2 << 2; -var AnyType = 0 << 4; -var typeNameToTypeValue = { +const AnyType = 0 << 4; +const typeNameToTypeValue = { 'no_type': 0 << 4, 'stylesheet': 1 << 4, 'image': 2 << 4, @@ -75,9 +75,9 @@ var typeNameToTypeValue = { 'webrtc': 19 << 4, 'unsupported': 20 << 4 }; -var otherTypeBitValue = typeNameToTypeValue.other; +const otherTypeBitValue = typeNameToTypeValue.other; -var typeValueToTypeName = { +const typeValueToTypeName = { 1: 'stylesheet', 2: 'image', 3: 'object', @@ -100,16 +100,16 @@ var typeValueToTypeName = { 20: 'unsupported' }; -var BlockAnyTypeAnyParty = BlockAction | AnyType | AnyParty; -var BlockAnyType = BlockAction | AnyType; -var BlockAnyParty = BlockAction | AnyParty; +const BlockAnyTypeAnyParty = BlockAction | AnyType | AnyParty; +const BlockAnyType = BlockAction | AnyType; +const BlockAnyParty = BlockAction | AnyParty; -var AllowAnyTypeAnyParty = AllowAction | AnyType | AnyParty; -var AllowAnyType = AllowAction | AnyType; -var AllowAnyParty = AllowAction | AnyParty; +const AllowAnyTypeAnyParty = AllowAction | AnyType | AnyParty; +const AllowAnyType = AllowAction | AnyType; +const AllowAnyParty = AllowAction | AnyParty; -var genericHideException = AllowAction | AnyParty | typeNameToTypeValue.generichide, - genericHideImportant = BlockAction | AnyParty | typeNameToTypeValue.generichide | Important; 
+const genericHideException = AllowAction | AnyParty | typeNameToTypeValue.generichide, + genericHideImportant = BlockAction | AnyParty | typeNameToTypeValue.generichide | Important; // ABP filters: https://adblockplus.org/en/filters // regex tester: http://regex101.com/ @@ -119,7 +119,7 @@ var genericHideException = AllowAction | AnyParty | typeNameToTypeValue.generich // See the following as short-lived registers, used during evaluation. They are // valid until the next evaluation. -var pageHostnameRegister = '', +let pageHostnameRegister = '', requestHostnameRegister = ''; //var filterRegister = null; //var categoryRegister = ''; @@ -127,13 +127,13 @@ var pageHostnameRegister = '', // Local helpers // Be sure to not confuse 'example.com' with 'anotherexample.com' -var isFirstParty = function(domain, hostname) { +const isFirstParty = function(domain, hostname) { return hostname.endsWith(domain) && (hostname.length === domain.length || hostname.charCodeAt(hostname.length - domain.length - 1) === 0x2E /* '.' */); }; -var normalizeRegexSource = function(s) { +const normalizeRegexSource = function(s) { try { var re = new RegExp(s); return re.source; @@ -143,12 +143,12 @@ var normalizeRegexSource = function(s) { return ''; }; -var rawToRegexStr = function(s, anchor) { - var me = rawToRegexStr; +const rawToRegexStr = function(s, anchor) { + let me = rawToRegexStr; // https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/ // https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions // Also: remove leading/trailing wildcards -- there is no point. 
- var reStr = s.replace(me.escape1, '\\$&') + let reStr = s.replace(me.escape1, '\\$&') .replace(me.escape2, '(?:[^%.0-9a-z_-]|$)') .replace(me.escape3, '') .replace(me.escape4, '[^ ]*?'); @@ -175,7 +175,7 @@ rawToRegexStr.reTextHostnameAnchor2 = '^[a-z-]+://(?:[^/?#]+)?'; const filterDataSerialize = µb.CompiledLineIO.serialize; -var toLogDataInternal = function(categoryBits, tokenHash, filter) { +const toLogDataInternal = function(categoryBits, tokenHash, filter) { if ( filter === null ) { return undefined; } let logData = filter.logData(); logData.compiled = filterDataSerialize([ @@ -209,7 +209,7 @@ var toLogDataInternal = function(categoryBits, tokenHash, filter) { }; // First character of match must be within the hostname part of the url. -var isHnAnchored = function(url, matchStart) { +const isHnAnchored = function(url, matchStart) { var hnStart = url.indexOf('://'); if ( hnStart === -1 ) { return false; } hnStart += 3; @@ -222,9 +222,9 @@ var isHnAnchored = function(url, matchStart) { return url.charCodeAt(matchStart - 1) === 0x2E; }; -var reURLPostHostnameAnchors = /[\/?#]/; +const reURLPostHostnameAnchors = /[\/?#]/; -var arrayStrictEquals = function(a, b) { +const arrayStrictEquals = function(a, b) { var n = a.length; if ( n !== b.length ) { return false; } var isArray, x, y; @@ -251,22 +251,22 @@ var arrayStrictEquals = function(a, b) { **/ -var filterClasses = [], - filterClassIdGenerator = 0; +const filterClasses = []; +let filterClassIdGenerator = 0; -var registerFilterClass = function(ctor) { - var fid = filterClassIdGenerator++; +const registerFilterClass = function(ctor) { + let fid = filterClassIdGenerator++; ctor.fid = ctor.prototype.fid = fid; filterClasses[fid] = ctor; }; -var filterFromCompiledData = function(args) { +const filterFromCompiledData = function(args) { return filterClasses[args[0]].load(args); }; /******************************************************************************/ -var FilterTrue = function() { +const FilterTrue = 
function() { }; FilterTrue.prototype.match = function() { @@ -297,7 +297,7 @@ registerFilterClass(FilterTrue); /******************************************************************************/ -var FilterPlain = function(s, tokenBeg) { +const FilterPlain = function(s, tokenBeg) { this.s = s; this.tokenBeg = tokenBeg; }; @@ -330,7 +330,7 @@ registerFilterClass(FilterPlain); /******************************************************************************/ -var FilterPlainPrefix0 = function(s) { +const FilterPlainPrefix0 = function(s) { this.s = s; }; @@ -362,7 +362,7 @@ registerFilterClass(FilterPlainPrefix0); /******************************************************************************/ -var FilterPlainPrefix1 = function(s) { +const FilterPlainPrefix1 = function(s) { this.s = s; }; @@ -394,7 +394,7 @@ registerFilterClass(FilterPlainPrefix1); /******************************************************************************/ -var FilterPlainHostname = function(s) { +const FilterPlainHostname = function(s) { this.s = s; }; @@ -429,7 +429,7 @@ registerFilterClass(FilterPlainHostname); /******************************************************************************/ -var FilterPlainLeftAnchored = function(s) { +const FilterPlainLeftAnchored = function(s) { this.s = s; }; @@ -461,7 +461,7 @@ registerFilterClass(FilterPlainLeftAnchored); /******************************************************************************/ -var FilterPlainRightAnchored = function(s) { +const FilterPlainRightAnchored = function(s) { this.s = s; }; @@ -493,7 +493,7 @@ registerFilterClass(FilterPlainRightAnchored); /******************************************************************************/ -var FilterExactMatch = function(s) { +const FilterExactMatch = function(s) { this.s = s; }; @@ -525,7 +525,7 @@ registerFilterClass(FilterExactMatch); /******************************************************************************/ -var FilterPlainHnAnchored = function(s) { +const FilterPlainHnAnchored = 
function(s) { this.s = s; }; @@ -558,7 +558,7 @@ registerFilterClass(FilterPlainHnAnchored); /******************************************************************************/ -var FilterGeneric = function(s, anchor) { +const FilterGeneric = function(s, anchor) { this.s = s; this.anchor = anchor; }; @@ -603,7 +603,7 @@ registerFilterClass(FilterGeneric); /******************************************************************************/ -var FilterGenericHnAnchored = function(s) { +const FilterGenericHnAnchored = function(s) { this.s = s; }; @@ -642,7 +642,7 @@ registerFilterClass(FilterGenericHnAnchored); /******************************************************************************/ -var FilterGenericHnAndRightAnchored = function(s) { +const FilterGenericHnAndRightAnchored = function(s) { FilterGenericHnAnchored.call(this, s); }; @@ -682,7 +682,7 @@ registerFilterClass(FilterGenericHnAndRightAnchored); /******************************************************************************/ -var FilterRegex = function(s) { +const FilterRegex = function(s) { this.re = s; }; @@ -723,7 +723,7 @@ registerFilterClass(FilterRegex); // Filtering according to the origin. 
-var FilterOrigin = function() { +const FilterOrigin = function() { }; FilterOrigin.prototype.wrapped = { @@ -766,7 +766,7 @@ FilterOrigin.prototype.compile = function() { // *** start of specialized origin matchers -var FilterOriginHit = function(domainOpt) { +const FilterOriginHit = function(domainOpt) { FilterOrigin.call(this); this.hostname = domainOpt; }; @@ -792,7 +792,7 @@ FilterOriginHit.prototype = Object.create(FilterOrigin.prototype, { // -var FilterOriginMiss = function(domainOpt) { +const FilterOriginMiss = function(domainOpt) { FilterOrigin.call(this); this.hostname = domainOpt.slice(1); }; @@ -811,14 +811,15 @@ FilterOriginMiss.prototype = Object.create(FilterOrigin.prototype, { var needle = this.hostname, haystack = pageHostnameRegister; if ( haystack.endsWith(needle) === false ) { return true; } var offset = haystack.length - needle.length; - return offset !== 0 && haystack.charCodeAt(offset - 1) !== 0x2E /* '.' */; + return offset !== 0 && + haystack.charCodeAt(offset - 1) !== 0x2E /* '.' */; } }, }); // -var FilterOriginHitSet = function(domainOpt) { +const FilterOriginHitSet = function(domainOpt) { FilterOrigin.call(this); this.domainOpt = domainOpt.length < 128 ? domainOpt @@ -840,17 +841,17 @@ FilterOriginHitSet.prototype = Object.create(FilterOrigin.prototype, { }, matchOrigin: { value: function() { - if ( this.oneOf === null ) { - this.oneOf = HNTrieBuilder.fromDomainOpt(this.domainOpt); + if ( hnTrieManager.isValidRef(this.oneOf) === false ) { + this.oneOf = hnTrieManager.fromDomainOpt(this.domainOpt); } - return this.oneOf.matches(pageHostnameRegister); + return this.oneOf.matches(pageHostnameRegister) === 1; } }, }); // -var FilterOriginMissSet = function(domainOpt) { +const FilterOriginMissSet = function(domainOpt) { FilterOrigin.call(this); this.domainOpt = domainOpt.length < 128 ? 
domainOpt @@ -872,17 +873,19 @@ FilterOriginMissSet.prototype = Object.create(FilterOrigin.prototype, { }, matchOrigin: { value: function() { - if ( this.noneOf === null ) { - this.noneOf = HNTrieBuilder.fromDomainOpt(this.domainOpt.replace(/~/g, '')); + if ( hnTrieManager.isValidRef(this.noneOf) === false ) { + this.noneOf = hnTrieManager.fromDomainOpt( + this.domainOpt.replace(/~/g, '') + ); } - return this.noneOf.matches(pageHostnameRegister) === false; + return this.noneOf.matches(pageHostnameRegister) === 0; } }, }); // -var FilterOriginMixedSet = function(domainOpt) { +const FilterOriginMixedSet = function(domainOpt) { FilterOrigin.call(this); this.domainOpt = domainOpt.length < 128 ? domainOpt @@ -903,20 +906,16 @@ FilterOriginMixedSet.prototype = Object.create(FilterOrigin.prototype, { }, init: { value: function() { - var oneOf = [], noneOf = [], - hostnames = this.domainOpt.split('|'), - i = hostnames.length, - hostname; - while ( i-- ) { - hostname = hostnames[i]; + let oneOf = [], noneOf = []; + for ( let hostname of this.domainOpt.split('|') ) { if ( hostname.charCodeAt(0) === 0x7E /* '~' */ ) { noneOf.push(hostname.slice(1)); } else { oneOf.push(hostname); } } - this.oneOf = HNTrieBuilder.fromIterable(oneOf); - this.noneOf = HNTrieBuilder.fromIterable(noneOf); + this.oneOf = hnTrieManager.fromIterable(oneOf); + this.noneOf = hnTrieManager.fromIterable(noneOf); } }, toDomainOpt: { @@ -926,10 +925,12 @@ FilterOriginMixedSet.prototype = Object.create(FilterOrigin.prototype, { }, matchOrigin: { value: function() { - if ( this.oneOf === null ) { this.init(); } - var needle = pageHostnameRegister; - return this.oneOf.matches(needle) && - this.noneOf.matches(needle) === false; + if ( hnTrieManager.isValidRef(this.oneOf) === false ) { + this.init(); + } + let needle = pageHostnameRegister; + return this.oneOf.matches(needle) === 1 && + this.noneOf.matches(needle) === 0; } }, }); @@ -981,7 +982,7 @@ registerFilterClass(FilterOrigin); 
/******************************************************************************/ -var FilterDataHolder = function(dataType, dataStr) { +const FilterDataHolder = function(dataType, dataStr) { this.dataType = dataType; this.dataStr = dataStr; this.wrapped = undefined; @@ -1024,7 +1025,7 @@ registerFilterClass(FilterDataHolder); // Helper class for storing instances of FilterDataHolder. -var FilterDataHolderEntry = function(categoryBits, tokenHash, fdata) { +const FilterDataHolderEntry = function(categoryBits, tokenHash, fdata) { this.categoryBits = categoryBits; this.tokenHash = tokenHash; this.filter = filterFromCompiledData(fdata); @@ -1047,7 +1048,7 @@ FilterDataHolderEntry.load = function(data) { // Dictionary of hostnames // -var FilterHostnameDict = function() { +const FilterHostnameDict = function() { this.h = ''; // short-lived register this.dict = new Set(); }; @@ -1138,7 +1139,7 @@ registerFilterClass(FilterHostnameDict); /******************************************************************************/ -var FilterPair = function(a, b) { +const FilterPair = function(a, b) { this.f1 = a; this.f2 = b; this.f = null; @@ -1217,7 +1218,7 @@ registerFilterClass(FilterPair); /******************************************************************************/ -var FilterBucket = function(a, b, c) { +const FilterBucket = function(a, b, c) { this.filters = []; this.f = null; if ( a !== undefined ) { @@ -1315,7 +1316,7 @@ registerFilterClass(FilterBucket); /******************************************************************************/ /******************************************************************************/ -var FilterParser = function() { +const FilterParser = function() { this.cantWebsocket = vAPI.cantWebsocket; this.reBadDomainOptChars = /[*+?^${}()[\]\\]/; this.reHostnameRule1 = /^[0-9a-z][0-9a-z.-]*[0-9a-z]$/i; @@ -1933,7 +1934,7 @@ FilterParser.prototype.makeToken = function() { /******************************************************************************/ 
/******************************************************************************/ -var FilterContainer = function() { +const FilterContainer = function() { this.reIsGeneric = /[\^\*]/; this.filterParser = new FilterParser(); this.urlTokenizer = µb.urlTokenizer; @@ -1960,6 +1961,9 @@ FilterContainer.prototype.reset = function() { this.dataFilters = new Map(); this.filterParser.reset(); + // This will invalidate all hn tries throughout uBO: + hnTrieManager.reset(); + // Runtime registers this.cbRegister = undefined; this.thRegister = undefined; @@ -2052,6 +2056,15 @@ FilterContainer.prototype.freeze = function() { /******************************************************************************/ +// This is necessary for when the filtering engine readiness will depend +// on asynchronous operations (ex.: when loading a wasm module). + +FilterContainer.prototype.readyToUse = function() { + return hnTrieManager.readyToUse(); +}; + +/******************************************************************************/ + FilterContainer.prototype.toSelfie = function() { let categoriesToSelfie = function(categoryMap) { let selfie = []; @@ -2250,7 +2263,7 @@ FilterContainer.prototype.compileToAtomicFilter = function( // Only static filter with an explicit type can be redirected. If we reach // this point, it's because there is one or more explicit type. 
- if ( parsed.badFilter === false && parsed.redirect ) { + if ( parsed.redirect ) { let redirects = µb.redirectEngine.compileRuleFromStaticFilter(parsed.raw); if ( Array.isArray(redirects) ) { for ( let redirect of redirects ) { @@ -2292,26 +2305,24 @@ FilterContainer.prototype.fromCompiledContent = function(reader) { FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out, outlog) { if ( this.dataFilters.length === 0 ) { return; } - var url = this.urlTokenizer.setURL(requestURL); + let url = this.urlTokenizer.setURL(requestURL); - requestHostnameRegister = µb.URI.hostnameFromURI(url); + pageHostnameRegister = requestHostnameRegister = µb.URI.hostnameFromURI(url); // We need to visit ALL the matching filters. - var toAddImportant = new Map(), + let toAddImportant = new Map(), toAdd = new Map(), toRemove = new Map(); - var entry, f, - tokenHashes = this.urlTokenizer.getTokens(), - tokenHash, tokenOffset, + let tokenHashes = this.urlTokenizer.getTokens(), i = 0; while ( i < 32 ) { - tokenHash = tokenHashes[i++]; + let tokenHash = tokenHashes[i++]; if ( tokenHash === 0 ) { break; } - tokenOffset = tokenHashes[i++]; - entry = this.dataFilters.get(tokenHash); + let tokenOffset = tokenHashes[i++]; + let entry = this.dataFilters.get(tokenHash); while ( entry !== undefined ) { - f = entry.filter; + let f = entry.filter; if ( f.match(url, tokenOffset) === true ) { if ( entry.categoryBits & 0x001 ) { toRemove.set(f.dataStr, entry); @@ -2324,9 +2335,9 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out entry = entry.next; } } - entry = this.dataFilters.get(this.noTokenHash); + let entry = this.dataFilters.get(this.noTokenHash); while ( entry !== undefined ) { - f = entry.filter; + let f = entry.filter; if ( f.match(url) === true ) { if ( entry.categoryBits & 0x001 ) { toRemove.set(f.dataStr, entry); @@ -2342,12 +2353,11 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out if ( 
toAddImportant.size === 0 && toAdd.size === 0 ) { return; } // Remove entries overriden by other filters. - var key; - for ( key of toAddImportant.keys() ) { + for ( let key of toAddImportant.keys() ) { toAdd.delete(key); toRemove.delete(key); } - for ( key of toRemove.keys() ) { + for ( let key of toRemove.keys() ) { if ( key === '' ) { toAdd.clear(); break; @@ -2355,26 +2365,25 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out toAdd.delete(key); } - var logData; - for ( entry of toAddImportant ) { + for ( let entry of toAddImportant ) { out.push(entry[0]); if ( outlog === undefined ) { continue; } - logData = entry[1].logData(); + let logData = entry[1].logData(); logData.source = 'static'; logData.result = 1; outlog.push(logData); } - for ( entry of toAdd ) { + for ( let entry of toAdd ) { out.push(entry[0]); if ( outlog === undefined ) { continue; } - logData = entry[1].logData(); + let logData = entry[1].logData(); logData.source = 'static'; logData.result = 1; outlog.push(logData); } if ( outlog !== undefined ) { - for ( entry of toRemove.values()) { - logData = entry.logData(); + for ( let entry of toRemove.values()) { + let logData = entry.logData(); logData.source = 'static'; logData.result = 2; outlog.push(logData); @@ -2389,20 +2398,19 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out FilterContainer.prototype.matchTokens = function(bucket, url) { // Hostname-only filters - var f = bucket.get(this.dotTokenHash); + let f = bucket.get(this.dotTokenHash); if ( f !== undefined && f.match() === true ) { this.thRegister = this.dotTokenHash; this.fRegister = f; return true; } - var tokenHashes = this.urlTokenizer.getTokens(), - tokenHash, tokenOffset, + let tokenHashes = this.urlTokenizer.getTokens(), i = 0; for (;;) { - tokenHash = tokenHashes[i++]; + let tokenHash = tokenHashes[i++]; if ( tokenHash === 0 ) { break; } - tokenOffset = tokenHashes[i++]; + let tokenOffset = tokenHashes[i++]; f = 
bucket.get(tokenHash); if ( f !== undefined && f.match(url, tokenOffset) === true ) { this.thRegister = tokenHash; @@ -2437,8 +2445,10 @@ FilterContainer.prototype.matchStringGenericHide = function(requestURL) { let url = this.urlTokenizer.setURL(requestURL); // https://github.com/gorhill/uBlock/issues/2225 - // Important: this is used by FilterHostnameDict.match(). - requestHostnameRegister = µb.URI.hostnameFromURI(url); + // Important: + // - `pageHostnameRegister` is used by FilterOrigin.matchOrigin(). + // - `requestHostnameRegister` is used by FilterHostnameDict.match(). + pageHostnameRegister = requestHostnameRegister = µb.URI.hostnameFromURI(url); let bucket = this.categories.get(genericHideException); if ( !bucket || this.matchTokens(bucket, url) === false ) { @@ -2548,7 +2558,7 @@ FilterContainer.prototype.matchString = function(context) { // https://github.com/chrisaljoudi/uBlock/issues/519 // Use exact type match for anything beyond `other` // Also, be prepared to support unknown types - var type = typeNameToTypeValue[context.requestType]; + let type = typeNameToTypeValue[context.requestType]; if ( type === undefined ) { type = otherTypeBitValue; } else if ( type === 0 || type > otherTypeBitValue ) { @@ -2577,7 +2587,7 @@ FilterContainer.prototype.matchString = function(context) { // filter. // Prime tokenizer: we get a normalized URL in return. - var url = this.urlTokenizer.setURL(context.requestURL); + let url = this.urlTokenizer.setURL(context.requestURL); // These registers will be used by various filters pageHostnameRegister = context.pageHostname || ''; @@ -2585,10 +2595,10 @@ FilterContainer.prototype.matchString = function(context) { this.fRegister = null; - var party = isFirstParty(context.pageDomain, context.requestHostname) + let party = isFirstParty(context.pageDomain, context.requestHostname) ? 
FirstParty : ThirdParty; - var categories = this.categories, + let categories = this.categories, catBits, bucket; // https://github.com/chrisaljoudi/uBlock/issues/139 diff --git a/src/js/storage.js b/src/js/storage.js index d88e82c69..e76578596 100644 --- a/src/js/storage.js +++ b/src/js/storage.js @@ -604,9 +604,7 @@ µBlock.loadFilterLists = function(callback) { // Callers are expected to check this first. - if ( this.loadingFilterLists ) { - return; - } + if ( this.loadingFilterLists ) { return; } this.loadingFilterLists = true; var µb = this, @@ -961,38 +959,31 @@ /******************************************************************************/ -µBlock.loadPublicSuffixList = function(callback) { - var µb = this, - assetKey = µb.pslAssetKey, - compiledAssetKey = 'compiled/' + assetKey; - - if ( typeof callback !== 'function' ) { - callback = this.noopFunc; - } - var onRawListLoaded = function(details) { - if ( details.content !== '' ) { - µb.compilePublicSuffixList(details.content); - } - callback(); - }; - - var onCompiledListLoaded = function(details) { - var selfie; +µBlock.loadPublicSuffixList = function() { + return new Promise(resolve => { + // start of executor + this.assets.get('compiled/' + this.pslAssetKey, details => { + let selfie; try { selfie = JSON.parse(details.content); } catch (ex) { } if ( - selfie === undefined || - publicSuffixList.fromSelfie(selfie) === false + selfie instanceof Object && + publicSuffixList.fromSelfie(selfie) ) { - µb.assets.get(assetKey, onRawListLoaded); + resolve(); return; } - callback(); - }; - - this.assets.get(compiledAssetKey, onCompiledListLoaded); + this.assets.get(this.pslAssetKey, details => { + if ( details.content !== '' ) { + this.compilePublicSuffixList(details.content); + } + resolve(); + }); + }); + // end of executor + }); }; /******************************************************************************/ diff --git a/src/js/wasm/README.md b/src/js/wasm/README.md new file mode 100644 index 
000000000..32aef076f --- /dev/null +++ b/src/js/wasm/README.md @@ -0,0 +1,24 @@ +### For code reviewers + +All `wasm` files in that directory where created by compiling the +corresponding `wat` file using the command (using `hntrie.wat`/`hntrie.wasm` +as example): + + wat2wasm hntrie.wat -o hntrie.wasm + +Assuming: + +- The command is executed from within the present directory. + +### `wat2wasm` tool + +The `wat2wasm` tool can be downloaded from an official WebAssembly project: +. + +### `wat2wasm` tool online + +You can also use the following online `wat2wasm` tool: +. + +Just paste the whole content of the `wat` file to compile into the WAT pane. +Click "Download" button to retrieve the resulting `wasm` file. \ No newline at end of file diff --git a/src/js/wasm/hntrie.wasm b/src/js/wasm/hntrie.wasm new file mode 100644 index 0000000000000000000000000000000000000000..b4bbbc2efe3574fad4a55b6e789d8a795ee2e180 GIT binary patch literal 337 zcmX|7K~BRk5S(4x3Dlq-_<)lWm%@#6e4rrl39MqNiRu(NIlw6|MdBy0NedTSJ2N{o zT7)4H04nH}CYtz$$~L_p^EebKrepqOkYYk392(m63CVtj*zL%)yv;{(b?lYc}Wr literal 0 HcmV?d00001 diff --git a/src/js/wasm/hntrie.wat b/src/js/wasm/hntrie.wat new file mode 100644 index 000000000..38813e772 --- /dev/null +++ b/src/js/wasm/hntrie.wat @@ -0,0 +1,200 @@ +;; +;; uBlock Origin - a browser extension to block requests. +;; Copyright (C) 2018-present Raymond Hill +;; +;; This program is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with this program. 
If not, see {http://www.gnu.org/licenses/}. +;; +;; Home: https://github.com/gorhill/uBlock +;; File: hntrie.wat +;; Description: WebAssembly code used by src/js/hntrie.js +;; How to compile: See README.md in this directory. + +(module +;; +;; module start +;; + +;; (func $log (import "imports" "log") (param i32 i32 i32)) + +(memory (import "imports" "memory") 1) + +;; +;; Public functions +;; + +;; +;; unsigned int matches(offset) +;; +;; Test whether the currently set needle matches the trie at specified offset. +;; +;; Memory layout, byte offset: +;; 0-254: encoded needle (ASCII) +;; 255 : needle length +;; 256- : tries +;; +(func (export "matches") + (param $itrie i32) + (result i32) ;; result: 0 = miss, 1 = hit + (local $ineedle i32) ;; current needle offset + (local $nchar i32) ;; needle char being processed + (local $tchar i32) ;; trie char being processed + (local $lxtra i32) + (local $ixtra i32) + i32.const 255 + i32.load8_u + set_local $ineedle + loop $nextNeedleChar + ;; ineedle -= 1; + get_local $ineedle + i32.const -1 + i32.add + tee_local $ineedle + ;; let nchar = ineedle === -1 ? 
0 : buf[ineedle]; + i32.const 0 + i32.lt_s + if + i32.const 0 + set_local $nchar + else + get_local $ineedle + i32.load8_u + set_local $nchar + end + block $trieCharEqNeedleChar loop $nextTrieChar + ;; let tchar = buf[itrie+8]; + get_local $itrie + i32.load8_u offset=8 + tee_local $tchar + ;; if ( tchar === nchar ) { break; } + get_local $nchar + i32.eq + br_if $trieCharEqNeedleChar + ;; if ( tchar === 0 && nchar === 0x2E ) { return 1; } + get_local $tchar + i32.eqz + if + get_local $nchar + i32.const 0x2E + i32.eq + if + i32.const 1 + return + end + end + ;; itrie = buf32[itrie >>> 2]; + get_local $itrie + i32.load + tee_local $itrie + ;; if ( itrie === 0 ) { return 0; } + i32.eqz + if + i32.const 0 + return + end + br $nextTrieChar + end end + ;; if ( nchar === 0 ) { return 1; } + get_local $nchar + i32.eqz + if + i32.const 1 + return + end + ;; let lxtra = buf[itrie+9]; + get_local $itrie + i32.load8_u offset=9 + tee_local $lxtra + i32.eqz + if else + ;; if ( lxtra > ineedle ) { return 0; } + get_local $lxtra + get_local $ineedle + i32.gt_u + if + i32.const 0 + return + end + ;; let ixtra = itrie + 10; + get_local $itrie + i32.const 10 + i32.add + tee_local $ixtra + ;; lxtra += ixtra; + get_local $lxtra + i32.add + set_local $lxtra + ;; do { + block $noMoreExtraChars loop + ;; ineedle -= 1; + get_local $ineedle + i32.const -1 + i32.add + tee_local $ineedle + ;; if ( buf[ineedle] !== buf[ixtra] ) { return 0; } + i32.load8_u + get_local $ixtra + i32.load8_u + i32.ne + if + i32.const 0 + return + end + ;; ixtra += 1; + get_local $ixtra + i32.const 1 + i32.add + tee_local $ixtra + ;; while ( ixtra !== lxtra ) { + get_local $lxtra + i32.eq + br_if $noMoreExtraChars + br 0 + end end + end + ;; itrie = buf32[itrie + 4 >>> 2]; + get_local $itrie + i32.load offset=4 + tee_local $itrie + ;; if ( itrie === 0 ) { + i32.eqz + if + ;; return ineedle === 0 || buf[ineedle-1] === 0x2E ? 
1 : 0; + get_local $ineedle + i32.eqz + if + i32.const 1 + return + end + get_local $ineedle + i32.const -1 + i32.add + i32.load8_u + i32.const 0x2E + i32.eq + if + i32.const 1 + return + end + i32.const 0 + return + end + br 0 + end + i32.const 0 +) + +;; +;; module end +;; +) diff --git a/test/hnset-benchmark.html b/test/hnset-benchmark.html new file mode 100644 index 000000000..fccc3764e --- /dev/null +++ b/test/hnset-benchmark.html @@ -0,0 +1,479 @@ + + + + + + +

Benchmark of hostname-lookup data structures: Set, RegExp, HNTrie

+

+
+
+
+
+
+
+
+ + + + + + + + + + + diff --git a/test/hntrie-test.html b/test/hntrie-test.html new file mode 100644 index 000000000..9644b643c --- /dev/null +++ b/test/hntrie-test.html @@ -0,0 +1,45866 @@ + + + + + + +

HNTrie test

+
+
+ + + + + + +