From c6fb70b1f0acb1003b516ceb678df27f6657b4f8 Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Tue, 10 Aug 2021 09:27:59 -0400 Subject: [PATCH] Refactor hntrie to avoid the need for boundary cells Whereas before the string segment was encoded as: LL OOOOOOOOOOOO where L are the upper 8 bits and used to encode the length of the segment, and O are the lower 24 bits and used to encode the offset of the string data in the character buffer, the new code encodes it as follows: OOOOOOOOOOOO LL And furthermore the most significant bit of the length LL is now used to mark whether the current string segment is a label boundary. This means a cell can't reference a segment longer than 127 characters. To work around this limitation for when a segment is longer than 127 characters (a rare occurrence), the algorithm will simply split the segment into multiple adjacent cells. As a result, there is no longer a need to encode "boundariness" into special cells, which simplifies both the storing and matching algorithms. Additionally, added minimal documentation for the NPM package on how to import and use HNTrieContainer as a standalone API. --- platform/nodejs/README.md | 62 ++++++ platform/nodejs/package.json | 5 +- platform/nodejs/test.js | 70 ++++-- src/js/background.js | 2 +- src/js/hntrie.js | 114 +++++----- src/js/static-net-filtering.js | 2 +- src/js/wasm/hntrie.wasm | Bin 1025 -> 1034 bytes src/js/wasm/hntrie.wat | 374 +++++++++++++++++---------------- 8 files changed, 375 insertions(+), 254 deletions(-) diff --git a/platform/nodejs/README.md b/platform/nodejs/README.md index f1ca5b41b..41fb6adfd 100644 --- a/platform/nodejs/README.md +++ b/platform/nodejs/README.md @@ -94,3 +94,65 @@ It is possible to pre-parse filter lists and save the intermediate results for later use -- useful to speed up the loading of filter lists. This will be documented eventually, but if you feel adventurous, you can look at the code and use this capability now if you figure out the details. 
+ +--- + +## Extras + +You can directly use specific APIs exposed by this package; here are some of +them, which are used internally by uBO's SNFE. + +### `HNTrieContainer` + +A well optimised [compressed trie](https://en.wikipedia.org/wiki/Trie#Compressing_tries) +container specialized to specifically store and look up hostnames. + +The matching algorithm is designed for hostnames, i.e. the hostname labels +making up a hostname are matched from right to left, such that `www.example.org` +will be a match if `example.org` is stored into the trie, while +`anotherexample.org` won't be a match. + +`HNTrieContainer` is designed to store a large number of hostnames with CPU and +memory efficiency as a main concern -- and is a key component of uBO. + +To create and use a standalone `HNTrieContainer` object: + +```js +import HNTrieContainer from '@gorhill/ubo-core/js/hntrie.js'; + +const trieContainer = new HNTrieContainer(); + +const aTrie = trieContainer.createOne(); +aTrie.add('example.org'); +aTrie.add('example.com'); + +const anotherTrie = trieContainer.createOne(); +anotherTrie.add('foo.invalid'); +anotherTrie.add('bar.invalid'); + +// matches() returns the position at which the match starts, or -1 when +// there is no match. + +// Matches: return 4 +console.log("aTrie.matches('www.example.org')", aTrie.matches('www.example.org')); + +// Does not match: return -1 +console.log("aTrie.matches('www.foo.invalid')", aTrie.matches('www.foo.invalid')); + +// Does not match: return -1 +console.log("anotherTrie.matches('www.example.org')", anotherTrie.matches('www.example.org')); + +// Matches: return 0 +console.log("anotherTrie.matches('foo.invalid')", anotherTrie.matches('foo.invalid')); +``` + +The `reset()` method must be used to remove all the tries from a trie container; +you can't remove a single trie from the container. + +```js +trieContainer.reset(); +``` + +When you reset a trie container, you can't use the reference to prior instances +of trie, i.e. 
`aTrie` and `anotherTrie` are no longer valid and shouldn't be +used following a reset. diff --git a/platform/nodejs/package.json b/platform/nodejs/package.json index 53dba8add..15aafac93 100644 --- a/platform/nodejs/package.json +++ b/platform/nodejs/package.json @@ -1,6 +1,6 @@ { "name": "@gorhill/ubo-core", - "version": "0.1.7", + "version": "0.1.8", "description": "To create a working instance of uBlock Origin's static network filtering engine", "type": "module", "main": "index.js", @@ -15,7 +15,8 @@ "keywords": [ "uBlock", "uBO", - "adblock" + "adblock", + "trie" ], "author": "Raymond Hill", "license": "GPL-3.0-or-later", diff --git a/platform/nodejs/test.js b/platform/nodejs/test.js index d93b6939b..040e453bb 100644 --- a/platform/nodejs/test.js +++ b/platform/nodejs/test.js @@ -33,6 +33,8 @@ import { StaticNetFilteringEngine, } from './index.js'; +import HNTrieContainer from './js/hntrie.js'; + /******************************************************************************/ function fetch(listName) { @@ -42,7 +44,7 @@ function fetch(listName) { }); } -function runTests(engine) { +function testSNFE(engine) { let result = 0; // Tests @@ -77,6 +79,53 @@ function runTests(engine) { } } +async function doSNFE() { + const engine = await StaticNetFilteringEngine.create(); + + await engine.useLists([ + fetch('easylist').then(raw => ({ name: 'easylist', raw })), + fetch('easyprivacy').then(raw => ({ name: 'easyprivacy', raw })), + ]); + + testSNFE(engine); + + const serialized = await engine.serialize(); + engine.useLists([]); + + testSNFE(engine); + + await engine.deserialize(serialized); + + testSNFE(engine); +} + +async function doHNTrie() { + const trieContainer = new HNTrieContainer(); + + const aTrie = trieContainer.createOne(); + aTrie.add('example.org'); + aTrie.add('example.com'); + + const anotherTrie = trieContainer.createOne(); + anotherTrie.add('foo.invalid'); + anotherTrie.add('bar.invalid'); + + // matches() return the position at which the match 
starts, or -1 when + // there is no match. + + // Matches: return 4 + console.log("aTrie.matches('www.example.org')", aTrie.matches('www.example.org')); + + // Does not match: return -1 + console.log("aTrie.matches('www.foo.invalid')", aTrie.matches('www.foo.invalid')); + + // Does not match: return -1 + console.log("anotherTrie.matches('www.example.org')", anotherTrie.matches('www.example.org')); + + // Matches: return 0 + console.log("anotherTrie.matches('foo.invalid')", anotherTrie.matches('foo.invalid')); +} + async function main() { try { const result = await enableWASM(); @@ -87,23 +136,8 @@ async function main() { console.log(ex); } - const engine = await StaticNetFilteringEngine.create(); - - await engine.useLists([ - fetch('easylist').then(raw => ({ name: 'easylist', raw })), - fetch('easyprivacy').then(raw => ({ name: 'easyprivacy', raw })), - ]); - - runTests(engine); - - const serialized = await engine.serialize(); - engine.useLists([]); - - runTests(engine); - - await engine.deserialize(serialized); - - runTests(engine); + await doSNFE(); + await doHNTrie(); process.exit(); } diff --git a/src/js/background.js b/src/js/background.js index 5c9fbe364..86e7a6351 100644 --- a/src/js/background.js +++ b/src/js/background.js @@ -155,7 +155,7 @@ const µBlock = { // jshint ignore:line // Read-only systemSettings: { compiledMagic: 37, // Increase when compiled format changes - selfieMagic: 37, // Increase when selfie format changes + selfieMagic: 38, // Increase when selfie format changes }, // https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501 diff --git a/src/js/hntrie.js b/src/js/hntrie.js index 5af943139..094bf0ea9 100644 --- a/src/js/hntrie.js +++ b/src/js/hntrie.js @@ -184,21 +184,21 @@ const HNTrieContainer = class { let ineedle = buf8[255]; let icell = buf32[iroot+0]; if ( icell === 0 ) { return -1; } + let c = 0, v = 0, i0 = 0, n = 0; for (;;) { if ( ineedle === 0 ) { return -1; } ineedle -= 1; - let c = buf8[ineedle]; - let 
v, i0; + c = buf8[ineedle]; // find first segment with a first-character match for (;;) { v = buf32[icell+2]; - i0 = char0 + (v & 0x00FFFFFF); + i0 = char0 + (v >>> 8); if ( buf8[i0] === c ) { break; } icell = buf32[icell+0]; if ( icell === 0 ) { return -1; } } // all characters in segment must match - let n = v >>> 24; + n = v & 0x7F; if ( n > 1 ) { n -= 1; if ( n > ineedle ) { return -1; } @@ -210,17 +210,17 @@ const HNTrieContainer = class { i0 += 1; } while ( i0 < i1 ); } + // boundary at end of segment? + if ( (v & 0x80) !== 0 ) { + if ( ineedle === 0 || buf8[ineedle-1] === 0x2E /* '.' */ ) { + return ineedle; + } + } // next segment icell = buf32[icell+1]; if ( icell === 0 ) { break; } - if ( buf32[icell+2] === 0 ) { - if ( ineedle === 0 || buf8[ineedle-1] === 0x2E ) { - return ineedle; - } - icell = buf32[icell+1]; - } } - return ineedle === 0 || buf8[ineedle-1] === 0x2E ? ineedle : -1; + return -1; } createOne(args) { @@ -256,39 +256,31 @@ const HNTrieContainer = class { let icell = this.buf32[iroot+0]; // special case: first node in trie if ( icell === 0 ) { - this.buf32[iroot+0] = this.addCell(0, 0, this.addSegment(lhnchar)); + this.buf32[iroot+0] = this.addLeafCell(lhnchar); return 1; } // const char0 = this.buf32[CHAR0_SLOT]; - let inext; + let isegchar, lsegchar, boundaryBit, inext; // find a matching cell: move down for (;;) { - const vseg = this.buf32[icell+2]; - // skip boundary cells - if ( vseg === 0 ) { - // remainder is at label boundary? if yes, no need to add - // the rest since the shortest match is always reported - if ( this.buf[lhnchar-1] === 0x2E /* '.' 
*/ ) { return -1; } - icell = this.buf32[icell+1]; - continue; - } - let isegchar0 = char0 + (vseg & 0x00FFFFFF); + const v = this.buf32[icell+2]; + let isegchar0 = char0 + (v >>> 8); // if first character is no match, move to next descendant if ( this.buf[isegchar0] !== this.buf[lhnchar-1] ) { inext = this.buf32[icell+0]; if ( inext === 0 ) { - this.buf32[icell+0] = this.addCell(0, 0, this.addSegment(lhnchar)); + this.buf32[icell+0] = this.addLeafCell(lhnchar); return 1; } icell = inext; continue; } // 1st character was tested - let isegchar = 1; + isegchar = 1; lhnchar -= 1; // find 1st mismatch in rest of segment - const lsegchar = vseg >>> 24; + lsegchar = v & 0x7F; if ( lsegchar !== 1 ) { for (;;) { if ( isegchar === lsegchar ) { break; } @@ -298,49 +290,50 @@ const HNTrieContainer = class { lhnchar -= 1; } } + boundaryBit = v & 0x80; // all segment characters matched if ( isegchar === lsegchar ) { - inext = this.buf32[icell+1]; // needle remainder: no if ( lhnchar === 0 ) { - // boundary cell already present - if ( inext === 0 || this.buf32[inext+2] === 0 ) { return 0; } - // need boundary cell - this.buf32[icell+1] = this.addCell(0, inext, 0); + // boundary: yes, already present + if ( boundaryBit !== 0 ) { return 0; } + // boundary: no, mark as boundary + this.buf32[icell+2] = v | 0x80; } // needle remainder: yes else { + // remainder is at label boundary? if yes, no need to add + // the rest since the shortest match is always reported + if ( boundaryBit !== 0 ) { + if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; } + } + inext = this.buf32[icell+1]; if ( inext !== 0 ) { icell = inext; continue; } - // remainder is at label boundary? if yes, no need to add - // the rest since the shortest match is always reported - if ( this.buf[lhnchar-1] === 0x2E /* '.' 
*/ ) { return -1; } - // boundary cell + needle remainder - inext = this.addCell(0, 0, 0); - this.buf32[icell+1] = inext; - this.buf32[inext+1] = this.addCell(0, 0, this.addSegment(lhnchar)); + // add needle remainder + this.buf32[icell+1] = this.addLeafCell(lhnchar); } } // some segment characters matched else { // split current cell isegchar0 -= char0; - this.buf32[icell+2] = isegchar << 24 | isegchar0; + this.buf32[icell+2] = isegchar0 << 8 | isegchar; inext = this.addCell( 0, this.buf32[icell+1], - lsegchar - isegchar << 24 | isegchar0 + isegchar + isegchar0 + isegchar << 8 | boundaryBit | lsegchar - isegchar ); this.buf32[icell+1] = inext; - // needle remainder: no = need boundary cell - if ( lhnchar === 0 ) { - this.buf32[icell+1] = this.addCell(0, inext, 0); + // needle remainder: yes, need new cell for remaining characters + if ( lhnchar !== 0 ) { + this.buf32[inext+0] = this.addLeafCell(lhnchar); } - // needle remainder: yes = need new cell for remaining characters + // needle remainder: no, need boundary cell else { - this.buf32[inext+0] = this.addCell(0, 0, this.addSegment(lhnchar)); + this.buf32[icell+2] |= 0x80; } } return 1; @@ -459,9 +452,9 @@ const HNTrieContainer = class { async enableWASM(wasmModuleFetcher, path) { // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/globalThis const globals = (( ) => { + if ( typeof self !== 'undefined' ) { return self; } // jshint ignore:start if ( typeof globalThis !== 'undefined' ) { return globalThis; } - if ( typeof self !== 'undefined' ) { return self; } if ( typeof global !== 'undefined' ) { return global; } // jshint ignore:end return {}; @@ -508,16 +501,33 @@ const HNTrieContainer = class { return icell; } - addSegment(lsegchar) { + addLeafCell(lsegchar) { + const r = this.buf32[TRIE1_SLOT] >>> 2; + let i = r; + while ( lsegchar > 127 ) { + this.buf32[i+0] = 0; + this.buf32[i+1] = i + 3; + this.buf32[i+2] = this.addSegment(lsegchar, lsegchar - 127); + lsegchar -= 127; + i 
+= 3; + } + this.buf32[i+0] = 0; + this.buf32[i+1] = 0; + this.buf32[i+2] = this.addSegment(lsegchar, 0) | 0x80; + this.buf32[TRIE1_SLOT] = i + 3 << 2; + return r; + } + + addSegment(lsegchar, lsegend) { if ( lsegchar === 0 ) { return 0; } let char1 = this.buf32[CHAR1_SLOT]; const isegchar = char1 - this.buf32[CHAR0_SLOT]; let i = lsegchar; do { this.buf[char1++] = this.buf[--i]; - } while ( i !== 0 ); + } while ( i !== lsegend ); this.buf32[CHAR1_SLOT] = char1; - return (lsegchar << 24) | isegchar; + return isegchar << 8 | lsegchar - lsegend; } growBuf(trieGrow, charGrow) { @@ -724,8 +734,8 @@ HNTrieContainer.prototype.HNTrieRef = class { this.forks.push(idown, this.charPtr); } const v = this.container.buf32[this.icell+2]; - let i0 = this.container.buf32[CHAR0_SLOT] + (v & 0x00FFFFFF); - const i1 = i0 + (v >>> 24); + let i0 = this.container.buf32[CHAR0_SLOT] + (v >>> 8); + const i1 = i0 + (v & 0x7F); while ( i0 < i1 ) { this.charPtr -= 1; this.charBuf[this.charPtr] = this.container.buf[i0]; @@ -795,4 +805,4 @@ const getWasmModule = (( ) => { /******************************************************************************/ -export { HNTrieContainer }; +export default HNTrieContainer; diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 69fce94e6..aba753f8b 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -26,9 +26,9 @@ /******************************************************************************/ import globals from './globals.js'; +import HNTrieContainer from './hntrie.js'; import { sparseBase64 } from './base64-custom.js'; import { BidiTrieContainer } from './biditrie.js'; -import { HNTrieContainer } from './hntrie.js'; import { StaticFilteringParser } from './static-filtering-parser.js'; import { CompiledListReader } from './static-filtering-io.js'; diff --git a/src/js/wasm/hntrie.wasm b/src/js/wasm/hntrie.wasm index 
3fdec201bd928fc3d334df2856099a92432ae519..9067f42b4cd30f7cf458ad41afb1be980ae42fb5 100644 GIT binary patch literal 1034 zcmY*YJ#Q015S^Lb`!MGyra_c*Dvw#A2T^Yzu2 z_e6UnUtg^^-8Uh2dB!a4iic9HSG#u~ueJbJmzN|Deizwa=CY4J7m8t88M8cazWfx?my9iN>UdW?#Y;ZrY` z5c}$n<4Hd>8^l^d+TcO_VNSI1@r7T^D0@k9wu`?8>{b#8zSI*P^roaT^K&%Id#2h! zf3n4!Ev^Zf*&>5Dw35sTCCV!wWNQhfHBY&UT<0mi7WK(9n1jS*c&>#RET=d!*mcMa z2<|rjv(5Di6!58@sAZz2^{tbkcL=c8ZZ2hJ*j{C_(8e-Y7)m^i@*JQ+wXZTsPvQ)obc(UI-OPn@E?#Bi8e2CGb8D1=F`1KGJ!kGQlwmHx#c2bV6&pHy z*r=R}m2^>Vs_!fokJ$vSC0RxA?u~gBQWoIo7LLis`gk4nsTt_BDe=S zck?*huX%3gV)q|n{w-ELQX6mCO>BXJieBsjU#XJYDFia5IoWu%vG zE~U0A#o}G(NEHe7QnroPiQ^#AmM`w+_{$URh7m49&GS4^W~c`Jv;#`x;|q-1yj;rLA-w--PYRXjAFj~P2 zh>q%HWhs^V#^<;du;F+BLU64;6sv%>Xhj~aHZ&f!_*4X{mw4>P_zpzkBi}@${9Pel zGl9w0Ba38I=B)5FGkD}`U4Kl-)rBZTy9&S=XMK0ZUUY(rC0HTY(|Y0r}`DyLCl_NUWPz>> 8); + i32.const 8 + i32.shr_u get_local $char0 i32.add tee_local $i0 @@ -130,10 +130,10 @@ end br 0 end end - ;; let n = v >>> 24; + ;; let n = v & 0x7F; get_local $v - i32.const 24 - i32.shr_u + i32.const 0x7F + i32.and tee_local $n ;; if ( n > 1 ) { i32.const 1 @@ -186,21 +186,12 @@ br_if 0 end end - ;; icell = this.buf32[icell+1]; - get_local $icell - i32.load offset=4 - i32.const 2 - i32.shl - tee_local $icell - ;; if ( icell === 0 ) { break; } - i32.eqz - br_if $noSegment - ;; if ( this.buf32[icell+2] === 0 ) { - get_local $icell - i32.load offset=8 - i32.eqz + ;; if ( (v & 0x80) !== 0 ) { + get_local $v + i32.const 0x80 + i32.and if - ;; if ( ineedle === 0 || this.buf[ineedle-1] === 0x2E ) { + ;; if ( ineedle === 0 || buf8[ineedle-1] === 0x2E /* '.' 
*/ ) { ;; return ineedle; ;; } get_local $ineedle @@ -219,32 +210,17 @@ get_local $ineedle return end - ;; icell = this.buf32[icell+1]; - get_local $icell - i32.load offset=4 - i32.const 2 - i32.shl - set_local $icell end - br 0 + ;; icell = this.buf32[icell+1]; + get_local $icell + i32.load offset=4 + i32.const 2 + i32.shl + tee_local $icell + ;; if ( icell === 0 ) { break; } + br_if 0 end end - ;; return ineedle === 0 || this.buf[ineedle-1] === 0x2E ? ineedle : -1; - get_local $ineedle - i32.eqz - if - i32.const 0 - return - end - get_local $ineedle - i32.const -1 - i32.add - i32.load8_u - i32.const 0x2E - i32.eq - if - get_local $ineedle - return - end + ;; return -1; i32.const -1 ) @@ -259,11 +235,12 @@ (local $icell i32) ;; index of current cell in the trie (local $lhnchar i32) ;; number of characters left to process in hostname (local $char0 i32) ;; offset to start of character data section - (local $vseg i32) ;; integer value describing a segment + (local $v i32) ;; integer value describing a segment (local $isegchar0 i32) ;; offset to start of current segment's character data (local $isegchar i32) (local $lsegchar i32) ;; number of character in current segment (local $inext i32) ;; index of next cell to process + (local $boundaryBit i32) ;; the boundary bit state of the current cell ;; ;; let lhnchar = this.buf[255]; i32.const 255 @@ -315,14 +292,11 @@ ;; if ( this.buf32[icell+2] === 0 ) { i32.eqz if - ;; this.buf32[iroot+0] = this.addCell(0, 0, this.addSegment(lhnchar)); + ;; this.buf32[iroot+0] = this.addLeafCell(lhnchar); ;; return 1; get_local $iroot - i32.const 0 - i32.const 0 get_local $lhnchar - call $addSegment - call $addCell + call $addLeafCell i32.store i32.const 1 return @@ -336,35 +310,11 @@ ;; const v = this.buf32[icell+2]; get_local $icell i32.load offset=8 - tee_local $vseg - ;; if ( vseg === 0 ) { - i32.eqz - if - ;; if ( this.buf[lhnchar-1] === 0x2E /* '.' 
*/ ) { return -1; } - get_local $lhnchar - i32.const -1 - i32.add - i32.load8_u - i32.const 0x2E - i32.eq - if - i32.const -1 - return - end - ;; icell = this.buf32[icell+1]; - ;; continue; - get_local $icell - i32.load offset=4 - i32.const 2 - i32.shl - set_local $icell - br $nextSegment - end - ;; let isegchar0 = char0 + (vseg & 0x00FFFFFF); + tee_local $v + ;; let isegchar0 = char0 + (v >>> 8); + i32.const 8 + i32.shr_u get_local $char0 - get_local $vseg - i32.const 0x00FFFFFF - i32.and i32.add tee_local $isegchar0 ;; if ( this.buf[isegchar0] !== this.buf[lhnchar-1] ) { @@ -378,19 +328,14 @@ ;; inext = this.buf32[icell+0]; get_local $icell i32.load - i32.const 2 - i32.shl tee_local $inext ;; if ( inext === 0 ) { i32.eqz if - ;; this.buf32[icell+0] = this.addCell(0, 0, this.addSegment(lhnchar)); + ;; this.buf32[icell+0] = this.addLeafCell(lhnchar); get_local $icell - i32.const 0 - i32.const 0 get_local $lhnchar - call $addSegment - call $addCell + call $addLeafCell i32.store ;; return 1; i32.const 1 @@ -398,6 +343,8 @@ end ;; icell = inext; get_local $inext + i32.const 2 + i32.shl set_local $icell br $nextSegment end @@ -409,10 +356,10 @@ i32.const -1 i32.add set_local $lhnchar - ;; const lsegchar = vseg >>> 24; - get_local $vseg - i32.const 24 - i32.shr_u + ;; const lsegchar = v & 0x7F; + get_local $v + i32.const 0x7F + i32.and tee_local $lsegchar ;; if ( lsegchar !== 1 ) { i32.const 1 @@ -452,82 +399,66 @@ br 0 end end end + ;; const boundaryBit = v & 0x80; + get_local $v + i32.const 0x80 + i32.and + set_local $boundaryBit ;; if ( isegchar === lsegchar ) { get_local $isegchar get_local $lsegchar i32.eq if - ;; inext = this.buf32[icell+1]; - get_local $icell - i32.load offset=4 - i32.const 2 - i32.shl - set_local $inext ;; if ( lhnchar === 0 ) { get_local $lhnchar i32.eqz if - ;; if ( inext === 0 || this.buf32[inext+2] === 0 ) { return 0; } - get_local $inext - i32.eqz + ;; if ( boundaryBit !== 0 ) { return 0; } + get_local $boundaryBit if i32.const 0 return end 
- get_local $inext - i32.load offset=8 - i32.eqz - if - i32.const 0 - return - end - ;; this.buf32[icell+1] = this.addCell(0, inext, 0); + ;; this.buf32[icell+2] = v | 0x80; get_local $icell - i32.const 0 - get_local $inext - i32.const 2 - i32.shr_u - i32.const 0 - call $addCell - i32.store offset=4 + get_local $v + i32.const 0x80 + i32.or + i32.store offset=8 else + ;; if ( boundaryBit !== 0 ) { + get_local $boundaryBit + if + ;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; } + get_local $lhnchar + i32.const -1 + i32.add + i32.load8_u + i32.const 0x2E + i32.eq + if + i32.const -1 + return + end + end + ;; inext = this.buf32[icell+1]; + get_local $icell + i32.load offset=4 + tee_local $inext ;; if ( inext !== 0 ) { - get_local $inext if ;; icell = inext; get_local $inext + i32.const 2 + i32.shl set_local $icell + ;; continue; br $nextSegment end - ;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; } - get_local $lhnchar - i32.const -1 - i32.add - i32.load8_u - i32.const 0x2E - i32.eq - if - i32.const -1 - return - end - ;; inext = this.addCell(0, 0, 0); - ;; this.buf32[icell+1] = inext; + ;; this.buf32[icell+1] = this.addLeafCell(lhnchar); get_local $icell - i32.const 0 - i32.const 0 - i32.const 0 - call $addCell - tee_local $inext - i32.store offset=4 - ;; this.buf32[inext+1] = this.addCell(0, 0, this.addSegment(lhnchar)); - get_local $inext - i32.const 2 - i32.shl - i32.const 0 - i32.const 0 get_local $lhnchar - call $addSegment - call $addCell + call $addLeafCell i32.store offset=4 end else @@ -537,56 +468,54 @@ get_local $char0 i32.sub tee_local $isegchar0 - ;; this.buf32[icell+2] = isegchar << 24 | isegchar0; - get_local $isegchar - i32.const 24 + ;; this.buf32[icell+2] = isegchar0 << 8 | isegchar; + i32.const 8 i32.shl + get_local $isegchar i32.or i32.store offset=8 ;; inext = this.addCell( ;; 0, ;; this.buf32[icell+1], - ;; lsegchar - isegchar << 24 | isegchar0 + isegchar + ;; isegchar0 + isegchar << 8 | boundaryBit | lsegchar - 
isegchar ;; ); - ;; this.buf32[icell+1] = inext; get_local $icell i32.const 0 get_local $icell i32.load offset=4 - get_local $lsegchar - get_local $isegchar - i32.sub - i32.const 24 - i32.shl get_local $isegchar0 get_local $isegchar i32.add + i32.const 8 + i32.shl + get_local $boundaryBit + i32.or + get_local $lsegchar + get_local $isegchar + i32.sub i32.or call $addCell tee_local $inext + ;; this.buf32[icell+1] = inext; i32.store offset=4 - ;; if ( lhnchar === 0 ) { + ;; if ( lhnchar !== 0 ) { get_local $lhnchar - i32.eqz if - ;; this.buf32[icell+1] = this.addCell(0, inext, 0); - get_local $icell - i32.const 0 - get_local $inext - i32.const 0 - call $addCell - i32.store offset=4 - else - ;; this.buf32[inext+0] = this.addCell(0, 0, this.addSegment(lhnchar)); + ;; this.buf32[inext+0] = this.addLeafCell(lhnchar); get_local $inext i32.const 2 i32.shl - i32.const 0 - i32.const 0 get_local $lhnchar - call $addSegment - call $addCell + call $addLeafCell i32.store + else + ;; this.buf32[icell+2] |= 0x80; + get_local $icell + get_local $icell + i32.load offset=8 + i32.const 0x80 + i32.or + i32.store offset=8 end end ;; return 1; @@ -602,14 +531,14 @@ ;; ;; -;; unsigned int addCell(idown, iright, vseg) +;; unsigned int addCell(idown, iright, v) ;; ;; Add a new cell, return cell index. ;; (func $addCell (param $idown i32) (param $iright i32) - (param $vseg i32) + (param $v i32) (result i32) ;; result: index of added cell (local $icell i32) ;; @@ -632,7 +561,7 @@ i32.store offset=4 ;; this.buf32[icell+2] = v; get_local $icell - get_local $vseg + get_local $v i32.store offset=8 ;; return icell; get_local $icell @@ -641,13 +570,96 @@ ) ;; -;; unsigned int addSegment(lsegchar) +;; unsigned int addLeafCell(lsegchar) +;; +;; Add a new cell, return cell index. 
+;; +(func $addLeafCell + (param $lsegchar i32) + (result i32) ;; result: index of added cell + (local $r i32) + (local $i i32) + ;; const r = this.buf32[TRIE1_SLOT] >>> 2; + i32.const 260 + i32.load + tee_local $r + ;; let i = r; + set_local $i + ;; while ( lsegchar > 127 ) { + block $lastSegment loop + get_local $lsegchar + i32.const 127 + i32.le_u + br_if $lastSegment + ;; this.buf32[i+0] = 0; + get_local $i + i32.const 0 + i32.store + ;; this.buf32[i+1] = i + 3; + get_local $i + get_local $i + i32.const 12 + i32.add + i32.const 2 + i32.shr_u + i32.store offset=4 + ;; this.buf32[i+2] = this.addSegment(lsegchar, lsegchar - 127); + get_local $i + get_local $lsegchar + get_local $lsegchar + i32.const 127 + i32.sub + call $addSegment + i32.store offset=8 + ;; lsegchar -= 127; + get_local $lsegchar + i32.const 127 + i32.sub + set_local $lsegchar + ;; i += 3; + get_local $i + i32.const 12 + i32.add + set_local $i + br 0 + end end + ;; this.buf32[i+0] = 0; + get_local $i + i32.const 0 + i32.store + ;; this.buf32[i+1] = 0; + get_local $i + i32.const 0 + i32.store offset=4 + ;; this.buf32[i+2] = this.addSegment(lsegchar, 0) | 0x80; + get_local $i + get_local $lsegchar + i32.const 0 + call $addSegment + i32.const 0x80 + i32.or + i32.store offset=8 + ;; this.buf32[TRIE1_SLOT] = i + 3 << 2; + i32.const 260 + get_local $i + i32.const 12 + i32.add + i32.store + ;; return r; + get_local $r + i32.const 2 + i32.shr_u +) + +;; +;; unsigned int addSegment(lsegchar, lsegend) ;; ;; Store a segment of characters and return a segment descriptor. The segment ;; is created from the character data in the needle buffer. 
;; (func $addSegment (param $lsegchar i32) + (param $lsegend i32) (result i32) ;; result: segment descriptor (local $char1 i32) ;; offset to end of character data section (local $isegchar i32) ;; relative offset to first character of segment @@ -673,7 +685,7 @@ get_local $lsegchar set_local $i ;; do { - block $endOfSegment loop + loop ;; this.buf[char1++] = this.buf[--i]; get_local $char1 get_local $i @@ -686,21 +698,23 @@ i32.const 1 i32.add set_local $char1 - ;; } while ( i !== 0 ); + ;; } while ( i !== lsegend ); get_local $i - i32.eqz - br_if $endOfSegment - br 0 - end end + get_local $lsegend + i32.ne + br_if 0 + end ;; this.buf32[HNBIGTRIE_CHAR1_SLOT] = char1; i32.const 268 get_local $char1 i32.store - ;; return (lsegchar << 24) | isegchar; - get_local $lsegchar - i32.const 24 - i32.shl + ;; return isegchar << 8 | lsegchar - lsegend; get_local $isegchar + i32.const 8 + i32.shl + get_local $lsegchar + get_local $lsegend + i32.sub i32.or )