diff --git a/src/background.html b/src/background.html index edc04ba6e..d3ccd0499 100644 --- a/src/background.html +++ b/src/background.html @@ -14,6 +14,7 @@ + diff --git a/src/js/hntrie.js b/src/js/hntrie.js new file mode 100644 index 000000000..9c19f4373 --- /dev/null +++ b/src/js/hntrie.js @@ -0,0 +1,474 @@ +/******************************************************************************* + + uBlock Origin - a browser extension to block requests. + Copyright (C) 2017 Raymond Hill + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see {http://www.gnu.org/licenses/}. + + Home: https://github.com/gorhill/uBlock +*/ + +'use strict'; + +/******************************************************************************* + + The original prototype was to develop an idea I had about using jump indices + in a TypedArray for quickly matching hostnames (or more generally strings)[1]. + Once I had a working, un-optimized prototype, I realized I had ended up + with something formally named a "trie": , + hence the name. I have no idea whether the implementation here or one + resembling it has been done elsewhere. + + "HN" in HNTrieBuilder stands for "HostName", because the trie is specialized + to deal with matching hostnames -- which is a bit more complicated than + matching plain strings. + + For example, `www.abc.com` is deemed matching `abc.com`, because the former + is a subdomain of the latter. The opposite is of course not true. + + The resulting read-only trie created as a result of using HNTrieBuilder are + simply just typed arrays filled with integers. The matching algorithm is + just a matter of reading/comparing these integers, and further using them as + indices in the array as a way to move around in the trie. + + There is still place for optimizations. Specifically, I could force the + strings to be properly sorted so that `HNTrie.matches` could bail earlier + when trying to find a matching descendant -- but suspect the gain would be + marginal, if measurable. + + [1] To solve + +*/ + +var HNTrieBuilder = function() { + this.reset(); +}; + +/******************************************************************************* + + A plain javascript array is used to build the trie. It will be casted into + the appropriate read-only TypedArray[1] at vacuum time. + + [1] Depending on the size: Uint8Array, Uint16Array, or Uint32Array. + +*/ + +HNTrieBuilder.prototype.reset = function() { + this.buf = []; + this.bufsz = 0; + this.buf[0] = 0; + this.buf[1] = 0; + this.buf[2] = 0; + return this; +}; + +/******************************************************************************* + + Helpers for convenience. + +*/ + +HNTrieBuilder.fromDomainOpt = function(domainOpt) { + var builder = new HNTrieBuilder(); + builder.fromDomainOpt(domainOpt); + return builder.vacuum(); +}; + +HNTrieBuilder.fromIterable = function(hostnames) { + var builder = new HNTrieBuilder(); + builder.fromIterable(hostnames); + return builder.vacuum(); +}; + +HNTrieBuilder.print = function(trie) { + var buf = trie.buf, + i = 0, cc = [], ic, indent = 0, + forks = []; + for (;;) { + if ( buf[i] !== 0 ) { + forks.push(i, indent); + } + if ( buf[i+2] !== 0 ) { + cc.unshift(buf[i+2]); + } + for ( ic = 0; ic < buf[i+3]; ic++ ) { + cc.unshift(buf[i+4+ic]); + } + console.log('\xB7'.repeat(indent) + String.fromCharCode.apply(null, cc)); + indent += cc.length; + cc = []; + i = buf[i+1]; + if ( i === 0 ) { + if ( forks.length === 0 ) { break; } + indent = forks.pop(); + i = forks.pop(); + i = buf[i]; + } + } +}; + +/******************************************************************************* + + Since this trie is specialized for matching hostnames, the stored strings are + reversed internally, because of hostname comparison logic: + + Correct matching : + index 0123456 + abc.com + | + www.abc.com + index 01234567890 + + Incorrect matching: + index 0123456 + abc.com + | + www.abc.com + index 01234567890 + +*/ + +HNTrieBuilder.prototype.add = function(hn) { + var ichar = hn.length - 1; + if ( ichar === -1 ) { return; } + var c = hn.charCodeAt(ichar), + i = 0, inext; + for (;;) { + if ( this.buf[i+2] !== c ) { // match not found + inext = this.buf[i]; // move to descendant + if ( inext === 0 ) { break; } // no descendant + } else { // match found + if ( c === 0 ) { return; } + inext = this.buf[i+1]; // move to sibling + ichar -= 1; + c = ichar === -1 ? 0 : hn.charCodeAt(ichar); + } + i = inext; + } + // Any new string added will always cause a new descendant to be created. + // The only time this is not the case is when trying to store a string + // which is already in the trie. + inext = this.bufsz; // new descendant cell + this.buf[i] = inext; + this.buf[inext+0] = 0; // jump index to descendant + this.buf[inext+1] = 0; // jump index to sibling + this.buf[inext+2] = c; // character code + this.bufsz += 3; + if ( c === 0 ) { return; } // character zero is always last cell + do { // new branch sprouting made from + i = inext; // all characters left to store + ichar -= 1; + c = ichar === -1 ? 0 : hn.charCodeAt(ichar); + inext = this.bufsz; + this.buf[i+1] = inext; + this.buf[inext+0] = 0; + this.buf[inext+1] = 0; + this.buf[inext+2] = c; + this.bufsz += 3; + } while ( c!== 0 ); +}; + +/******************************************************************************* + + Not using String.split('|') to avoid memory churning. + +*/ + +HNTrieBuilder.prototype.fromDomainOpt = function(hostnames) { + var len = hostnames.length, + beg = 0, end; + while ( beg < len ) { + end = hostnames.indexOf('|', beg); + if ( end === -1 ) { end = len; } + this.add(hostnames.slice(beg, end)); + beg = end + 1; + } + return this; +}; + +HNTrieBuilder.prototype.fromIterable = function(hostnames) { + for ( var hn of hostnames ) { + this.add(hn); + } + return this; +}; + +/******************************************************************************/ + +HNTrieBuilder.prototype.matches = function(needle) { + var ichar = needle.length - 1, + buf = this.buf, i = 0, c; + for (;;) { + c = ichar === -1 ? 0 : needle.charCodeAt(ichar); + while ( buf[i+2] !== c ) { + i = buf[i]; + if ( i === 0 ) { return false; } + } + if ( c === 0 ) { return true; } + i = buf[i+1]; + if ( i === 0 ) { return c === 0x2E; } + ichar -= 1; + } +}; + +/******************************************************************************* + + Before vacuuming, each cell is 3 entry-long: + - Jump index to descendant (if any) + - Jump index to sibling (if any) + - character code + + All strings stored in the un-vacuumed trie are zero-terminated, and the + character zero does occupy a cell like any other character. Let's use _ to + represent character zero for sake of comments. The asterisk will be used to + highlight a node with a descendant. + + Cases, before vacuuming: + + abc.com, abc.org: + * + _ -- a -- b -- c -- . -- c -- o -- m + _ -- a -- b -- c -- . -- o -- r -- g + + abc.com, xyz.com: + * + _ -- a -- b -- c -- . -- c -- o -- m + _ -- x -- y -- z + + ab.com, b.com: + * + _ -- a -- b -- . -- c -- o -- m + _ + + b.com, ab.com: + * + _ -- b -- . -- c -- o -- m + _ -- a + + Vacuuming is the process of merging sibling cells with no descendants. Cells + with descendants can't be merged. + + Each time we arrive at the end of a horizontal branch (sibling jump index is + 0), we walk back to the nearest previous node with descendants, and repeat + the process. Since there is no index information on where to come back, a + stack is used to remember cells with descendants (descendant jump index is + non zero) encountered on the way + + After vacuuming, each cell is 4+n entry-long: + - Jump index to descendant (if any) + - Jump index to sibling (if any) + - character code + - length of merged character code(s) + + Cases, after vacuuming: + + abc.com, abc.org: + * + [abc.co]m + [abc.or]g + + abc.com, xyz.com: + * + [ab]c -- [.co]m + [xy]z + + ab.com, b.com: + * + a -- [b.co]m + _ + + b.com, ab.com: + * + _ -- [b.co]m + a + + It's possible for a character zero cell to have descendants. + + It's not possible for a character zero cell to have next siblings. + + This will have to be taken into account during both vacuuming and matching. + + Character zero cells with no descendant are discarded during vacuuming. + Character zero cells with a descendant, or character zero cells which are a + decendant are kept into the vacuumed trie. + + A vacuumed trie is very efficient memory- and lookup-wise, but is also + read-only: no string can be added or removed. The read-only trie is really + just a self-sufficient array of integers, and can easily be exported/imported + as a JSON array. It is theoretically possible to "decompile" a trie (vacuumed + or not) into the set of strings originally added to it (in the order they + were added with the current implementation), but so far I do not need this + feature. + +*/ + +HNTrieBuilder.prototype.vacuum = function() { + if ( this.bufsz === 0 ) { return null; } + var input = this.buf, + output = [], outsz = 0, + forks = [], + iin = 0, iout; + for (;;) { + iout = outsz; + output[iout+0] = 0; + output[iout+1] = 0; + output[iout+2] = input[iin+2]; // first character + output[iout+3] = 0; + outsz += 4; + if ( input[iin] !== 0 ) { // cell with descendant + forks.push(iout, iin); // defer processing + } + for (;;) { // merge sibling cell(s) + iin = input[iin+1]; // sibling cell + if ( iin === 0 ) { break; } // no more sibling cell + if ( input[iin] !== 0 ) { break; } // cell with a descendant + if ( input[iin+2] === 0 ) { break; } // don't merge \x00 + output[outsz] = input[iin+2]; // add character data + outsz += 1; + } + if ( outsz !== iout + 4 ) { // cells were merged + output[iout+3] = outsz - iout - 4; // so adjust count + } + if ( iin !== 0 && input[iin] !== 0 ) { // can't merge this cell + output[iout+1] = outsz; + continue; + } + if ( forks.length === 0 ) { break; } // no more descendants: bye + iin = forks.pop(); // process next descendant + iout = forks.pop(); + iin = input[iin]; + output[iout] = outsz; + } + var trie; // pick optimal read-only + if ( outsz < 256 ) { // container array. + trie = new this.HNTrie8(output, outsz); + } else if ( outsz < 65536 ) { + trie = new this.HNTrie16(output, outsz); + } else { + trie = new this.HNTrie32(output, outsz); + } + this.reset(); // free working array + return trie; +}; + +/******************************************************************************* + + The following internal classes are the actual output of the vacuum() method. + + They use the minimal amount of data to be able to efficiently lookup strings + in a read-only trie. + + Given that javascript optimizers mind that the type of an argument passed to + a function always stays the same each time the function is called, there need + to be three separate implementation of matches() to allow the javascript + optimizer to do its job. + + The matching code deals only with looking up values in a TypedArray (beside + calls to String.charCodeAt), so I expect this to be fast and good candidate + for optimization by javascript engines. + +*/ + +HNTrieBuilder.prototype.HNTrie8 = function(buf, bufsz) { + this.buf = new Uint8Array(buf.slice(0, bufsz)); +}; + +HNTrieBuilder.prototype.HNTrie8.prototype.matches = function(needle) { + var ichar = needle.length, + i = 0, c1, c2, ccnt, ic, i1, i2; + for (;;) { + ichar -= 1; + c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar); + while ( (c2 = this.buf[i+2]) !== c1 ) { // quick test: first character + if ( c2 === 0 && c1 === 0x2E ) { return true; } + i = this.buf[i]; // next descendant + if ( i === 0 ) { return false; } // no more descendants + } + if ( c1 === 0 ) { return true; } + ccnt = this.buf[i+3]; + if ( ccnt > ichar ) { return false; } + if ( ccnt !== 0 ) { // cell is only one character + ic = ccnt; i1 = ichar-1; i2 = i+4; + while ( ic-- && needle.charCodeAt(i1-ic) === this.buf[i2+ic] ); + if ( ic !== -1 ) { return false; } + ichar -= ccnt; + } + i = this.buf[i+1]; // next sibling + if ( i === 0 ) { + return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E; + } + } +}; + +HNTrieBuilder.prototype.HNTrie16 = function(buf, bufsz) { + this.buf = new Uint16Array(buf.slice(0, bufsz)); +}; + +HNTrieBuilder.prototype.HNTrie16.prototype.matches = function(needle) { + var ichar = needle.length, + i = 0, c1, c2, ccnt, ic, i1, i2; + for (;;) { + ichar -= 1; + c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar); + while ( (c2 = this.buf[i+2]) !== c1 ) { // quick test: first character + if ( c2 === 0 && c1 === 0x2E ) { return true; } + i = this.buf[i]; // next descendant + if ( i === 0 ) { return false; } // no more descendants + } + if ( c1 === 0 ) { return true; } + ccnt = this.buf[i+3]; + if ( ccnt > ichar ) { return false; } + if ( ccnt !== 0 ) { // cell is only one character + ic = ccnt; i1 = ichar-1; i2 = i+4; + while ( ic-- && needle.charCodeAt(i1-ic) === this.buf[i2+ic] ); + if ( ic !== -1 ) { return false; } + ichar -= ccnt; + } + i = this.buf[i+1]; // next sibling + if ( i === 0 ) { + return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E; + } + } +}; + +HNTrieBuilder.prototype.HNTrie32 = function(buf, bufsz) { + this.buf = new Uint32Array(buf.slice(0, bufsz)); +}; + +HNTrieBuilder.prototype.HNTrie32.prototype.matches = function(needle) { + var ichar = needle.length, + i = 0, c1, c2, ccnt, ic, i1, i2; + for (;;) { + ichar -= 1; + c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar); + while ( (c2 = this.buf[i+2]) !== c1 ) { // quick test: first character + if ( c2 === 0 && c1 === 0x2E ) { return true; } + i = this.buf[i]; // next descendant + if ( i === 0 ) { return false; } // no more descendants + } + if ( c1 === 0 ) { return true; } + ccnt = this.buf[i+3]; + if ( ccnt > ichar ) { return false; } + if ( ccnt !== 0 ) { // cell is only one character + ic = ccnt; i1 = ichar-1; i2 = i+4; + while ( ic-- && needle.charCodeAt(i1-ic) === this.buf[i2+ic] ); + if ( ic !== -1 ) { return false; } + ichar -= ccnt; + } + i = this.buf[i+1]; // next sibling + if ( i === 0 ) { + return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E; + } + } +}; diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index aa011323c..4c157c04e 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -20,7 +20,7 @@ */ /* jshint bitwise: false */ -/* global punycode */ +/* global punycode, HNTrieBuilder */ 'use strict'; @@ -886,9 +886,9 @@ FilterOriginHitSet.prototype = Object.create(FilterOrigin.prototype, { matchOrigin: { value: function() { if ( this.oneOf === null ) { - this.oneOf = new RegExp('(?:^|\\.)(?:' + this.domainOpt.replace(/\./g, '\\.') + ')$'); + this.oneOf = HNTrieBuilder.fromDomainOpt(this.domainOpt); } - return this.oneOf.test(pageHostnameRegister); + return this.oneOf.matches(pageHostnameRegister); } }, }); @@ -918,9 +918,9 @@ FilterOriginMissSet.prototype = Object.create(FilterOrigin.prototype, { matchOrigin: { value: function() { if ( this.noneOf === null ) { - this.noneOf = new RegExp('(?:^|\\.)(?:' + this.domainOpt.replace(/~/g, '').replace(/\./g, '\\.') + ')$'); + this.noneOf = HNTrieBuilder.fromDomainOpt(this.domainOpt.replace(/~/g, '')); } - return this.noneOf.test(pageHostnameRegister) === false; + return this.noneOf.matches(pageHostnameRegister) === false; } }, }); @@ -960,8 +960,8 @@ FilterOriginMixedSet.prototype = Object.create(FilterOrigin.prototype, { oneOf.push(hostname); } } - this.oneOf = new RegExp('(?:^|\\.)(?:' + oneOf.join('|') + ')$'); - this.noneOf = new RegExp('(?:^|\\.)(?:' + noneOf.join('|') + ')$'); + this.oneOf = HNTrieBuilder.fromIterable(oneOf); + this.noneOf = HNTrieBuilder.fromIterable(noneOf); } }, toDomainOpt: { @@ -973,7 +973,8 @@ FilterOriginMixedSet.prototype = Object.create(FilterOrigin.prototype, { value: function() { if ( this.oneOf === null ) { this.init(); } var needle = pageHostnameRegister; - return this.oneOf.test(needle) && this.noneOf.test(needle) === false; + return this.oneOf.matches(needle) && + this.noneOf.matches(needle) === false; } }, });