1
0
mirror of https://github.com/gorhill/uBlock.git synced 2024-10-04 16:47:15 +02:00

Refactor hntrie to avoid the need for boundary cells

Whereas before the string segment was encoded as:

LL OOOOOOOOOOOO

where L are the upper 8 bits and used to encode the length
of the segment, and O are the lower 24 bits and used to
encode the offset of the string data in the character
buffer, the new code encodes it as follows:

OOOOOOOOOOOO LL

And furthermore the most significant bit of the length
LL is now used to mark whether the current string segment
is a label boundary.

This means a cell can't reference a segment longer than
127 characters. To work around this limitation for when a
segment is longer than 127 characters (a rare occurrence),
the algorithm will simply split the segment into multiple
adjacent cells.

As a result, there is no longer a need to encode
"boundariness" into special cells, which simplifies
both the storing and matching algorithms.

Additionally, added minimal documentation for the NPM
package on how to import and use HNTrieContainer as a
standalone API.
This commit is contained in:
Raymond Hill 2021-08-10 09:27:59 -04:00
parent a3f430ef03
commit c6fb70b1f0
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
8 changed files with 375 additions and 254 deletions

View File

@ -94,3 +94,65 @@ It is possible to pre-parse filter lists and save the intermediate results for
later use -- useful to speed up the loading of filter lists. This will be later use -- useful to speed up the loading of filter lists. This will be
documented eventually, but if you feel adventurous, you can look at the code documented eventually, but if you feel adventurous, you can look at the code
and use this capability now if you figure out the details. and use this capability now if you figure out the details.
---
## Extras
You can directly use specific APIs exposed by this package. Here are some of
them, which are used internally by uBO's SNFE.
### `HNTrieContainer`
A well optimised [compressed trie](https://en.wikipedia.org/wiki/Trie#Compressing_tries)
container specialized to store and look up hostnames.
The matching algorithm is designed for hostnames, i.e. the hostname labels
making up a hostname are matched from right to left, such that `www.example.org`
will be a match if `example.org` is stored into the trie, while
`anotherexample.org` won't be a match.
`HNTrieContainer` is designed to store a large number of hostnames with CPU and
memory efficiency as a main concern -- and is a key component of uBO.
To create and use a standalone `HNTrieContainer` object:
```js
import HNTrieContainer from '@gorhill/ubo-core/js/hntrie.js';
const trieContainer = new HNTrieContainer();
const aTrie = trieContainer.createOne();
aTrie.add('example.org');
aTrie.add('example.com');
const anotherTrie = trieContainer.createOne();
anotherTrie.add('foo.invalid');
anotherTrie.add('bar.invalid');
// matches() returns the position at which the match starts, or -1 when
// there is no match.
// Matches: return 4
console.log("aTrie.matches('www.example.org')", aTrie.matches('www.example.org'));
// Does not match: return -1
console.log("aTrie.matches('www.foo.invalid')", aTrie.matches('www.foo.invalid'));
// Does not match: return -1
console.log("anotherTrie.matches('www.example.org')", anotherTrie.matches('www.example.org'));
// Matches: return 0
console.log("anotherTrie.matches('foo.invalid')", anotherTrie.matches('foo.invalid'));
```
The `reset()` method must be used to remove all the tries from a trie container;
you can't remove a single trie from the container.
```js
trieContainer.reset();
```
When you reset a trie container, you can no longer use references to tries
created before the reset, i.e. `aTrie` and `anotherTrie` are no longer valid
and shouldn't be used following a reset.

View File

@ -1,6 +1,6 @@
{ {
"name": "@gorhill/ubo-core", "name": "@gorhill/ubo-core",
"version": "0.1.7", "version": "0.1.8",
"description": "To create a working instance of uBlock Origin's static network filtering engine", "description": "To create a working instance of uBlock Origin's static network filtering engine",
"type": "module", "type": "module",
"main": "index.js", "main": "index.js",
@ -15,7 +15,8 @@
"keywords": [ "keywords": [
"uBlock", "uBlock",
"uBO", "uBO",
"adblock" "adblock",
"trie"
], ],
"author": "Raymond Hill", "author": "Raymond Hill",
"license": "GPL-3.0-or-later", "license": "GPL-3.0-or-later",

View File

@ -33,6 +33,8 @@ import {
StaticNetFilteringEngine, StaticNetFilteringEngine,
} from './index.js'; } from './index.js';
import HNTrieContainer from './js/hntrie.js';
/******************************************************************************/ /******************************************************************************/
function fetch(listName) { function fetch(listName) {
@ -42,7 +44,7 @@ function fetch(listName) {
}); });
} }
function runTests(engine) { function testSNFE(engine) {
let result = 0; let result = 0;
// Tests // Tests
@ -77,6 +79,53 @@ function runTests(engine) {
} }
} }
/**
 * Exercise the static network filtering engine in three states: after
 * loading filter lists, after replacing them with an empty set of lists, and
 * after restoring the serialized state taken while the lists were loaded.
 *
 * `testSNFE()` is run against the engine after each state change.
 *
 * @returns {Promise<void>} Resolves once all three test passes have run.
 */
async function doSNFE() {
    const engine = await StaticNetFilteringEngine.create();

    // Each list entry is a promise resolving to a { name, raw } object, where
    // `raw` is whatever the local fetch() helper resolves to for that list.
    await engine.useLists([
        fetch('easylist').then(raw => ({ name: 'easylist', raw })),
        fetch('easyprivacy').then(raw => ({ name: 'easyprivacy', raw })),
    ]);
    testSNFE(engine);

    const serialized = await engine.serialize();

    // useLists() is awaited above, so it is awaited here as well: this avoids
    // a floating promise and ensures the engine has finished switching to the
    // empty set of lists before it is tested again.
    await engine.useLists([]);
    testSNFE(engine);

    await engine.deserialize(serialized);
    testSNFE(engine);
}
/**
 * Demonstrate standalone use of HNTrieContainer: two tries are created from
 * the same container, each is filled with two hostnames, and the result of
 * four lookups is printed to the console.
 *
 * @returns {Promise<void>}
 */
async function doHNTrie() {
    const trieContainer = new HNTrieContainer();

    const aTrie = trieContainer.createOne();
    for ( const hostname of [ 'example.org', 'example.com' ] ) {
        aTrie.add(hostname);
    }

    const anotherTrie = trieContainer.createOne();
    for ( const hostname of [ 'foo.invalid', 'bar.invalid' ] ) {
        anotherTrie.add(hostname);
    }

    // Print a lookup expression next to the value it evaluated to.
    const report = (expression, position) => {
        console.log(expression, position);
    };

    // matches() returns the position at which the match starts, or -1 when
    // there is no match.

    // Expected to match: 4
    report("aTrie.matches('www.example.org')", aTrie.matches('www.example.org'));

    // Expected not to match: -1
    report("aTrie.matches('www.foo.invalid')", aTrie.matches('www.foo.invalid'));

    // Expected not to match: -1
    report("anotherTrie.matches('www.example.org')", anotherTrie.matches('www.example.org'));

    // Expected to match: 0
    report("anotherTrie.matches('foo.invalid')", anotherTrie.matches('foo.invalid'));
}
async function main() { async function main() {
try { try {
const result = await enableWASM(); const result = await enableWASM();
@ -87,23 +136,8 @@ async function main() {
console.log(ex); console.log(ex);
} }
const engine = await StaticNetFilteringEngine.create(); await doSNFE();
await doHNTrie();
await engine.useLists([
fetch('easylist').then(raw => ({ name: 'easylist', raw })),
fetch('easyprivacy').then(raw => ({ name: 'easyprivacy', raw })),
]);
runTests(engine);
const serialized = await engine.serialize();
engine.useLists([]);
runTests(engine);
await engine.deserialize(serialized);
runTests(engine);
process.exit(); process.exit();
} }

View File

@ -155,7 +155,7 @@ const µBlock = { // jshint ignore:line
// Read-only // Read-only
systemSettings: { systemSettings: {
compiledMagic: 37, // Increase when compiled format changes compiledMagic: 37, // Increase when compiled format changes
selfieMagic: 37, // Increase when selfie format changes selfieMagic: 38, // Increase when selfie format changes
}, },
// https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501 // https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501

View File

@ -184,21 +184,21 @@ const HNTrieContainer = class {
let ineedle = buf8[255]; let ineedle = buf8[255];
let icell = buf32[iroot+0]; let icell = buf32[iroot+0];
if ( icell === 0 ) { return -1; } if ( icell === 0 ) { return -1; }
let c = 0, v = 0, i0 = 0, n = 0;
for (;;) { for (;;) {
if ( ineedle === 0 ) { return -1; } if ( ineedle === 0 ) { return -1; }
ineedle -= 1; ineedle -= 1;
let c = buf8[ineedle]; c = buf8[ineedle];
let v, i0;
// find first segment with a first-character match // find first segment with a first-character match
for (;;) { for (;;) {
v = buf32[icell+2]; v = buf32[icell+2];
i0 = char0 + (v & 0x00FFFFFF); i0 = char0 + (v >>> 8);
if ( buf8[i0] === c ) { break; } if ( buf8[i0] === c ) { break; }
icell = buf32[icell+0]; icell = buf32[icell+0];
if ( icell === 0 ) { return -1; } if ( icell === 0 ) { return -1; }
} }
// all characters in segment must match // all characters in segment must match
let n = v >>> 24; n = v & 0x7F;
if ( n > 1 ) { if ( n > 1 ) {
n -= 1; n -= 1;
if ( n > ineedle ) { return -1; } if ( n > ineedle ) { return -1; }
@ -210,17 +210,17 @@ const HNTrieContainer = class {
i0 += 1; i0 += 1;
} while ( i0 < i1 ); } while ( i0 < i1 );
} }
// boundary at end of segment?
if ( (v & 0x80) !== 0 ) {
if ( ineedle === 0 || buf8[ineedle-1] === 0x2E /* '.' */ ) {
return ineedle;
}
}
// next segment // next segment
icell = buf32[icell+1]; icell = buf32[icell+1];
if ( icell === 0 ) { break; } if ( icell === 0 ) { break; }
if ( buf32[icell+2] === 0 ) {
if ( ineedle === 0 || buf8[ineedle-1] === 0x2E ) {
return ineedle;
}
icell = buf32[icell+1];
}
} }
return ineedle === 0 || buf8[ineedle-1] === 0x2E ? ineedle : -1; return -1;
} }
createOne(args) { createOne(args) {
@ -256,39 +256,31 @@ const HNTrieContainer = class {
let icell = this.buf32[iroot+0]; let icell = this.buf32[iroot+0];
// special case: first node in trie // special case: first node in trie
if ( icell === 0 ) { if ( icell === 0 ) {
this.buf32[iroot+0] = this.addCell(0, 0, this.addSegment(lhnchar)); this.buf32[iroot+0] = this.addLeafCell(lhnchar);
return 1; return 1;
} }
// //
const char0 = this.buf32[CHAR0_SLOT]; const char0 = this.buf32[CHAR0_SLOT];
let inext; let isegchar, lsegchar, boundaryBit, inext;
// find a matching cell: move down // find a matching cell: move down
for (;;) { for (;;) {
const vseg = this.buf32[icell+2]; const v = this.buf32[icell+2];
// skip boundary cells let isegchar0 = char0 + (v >>> 8);
if ( vseg === 0 ) {
// remainder is at label boundary? if yes, no need to add
// the rest since the shortest match is always reported
if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
icell = this.buf32[icell+1];
continue;
}
let isegchar0 = char0 + (vseg & 0x00FFFFFF);
// if first character is no match, move to next descendant // if first character is no match, move to next descendant
if ( this.buf[isegchar0] !== this.buf[lhnchar-1] ) { if ( this.buf[isegchar0] !== this.buf[lhnchar-1] ) {
inext = this.buf32[icell+0]; inext = this.buf32[icell+0];
if ( inext === 0 ) { if ( inext === 0 ) {
this.buf32[icell+0] = this.addCell(0, 0, this.addSegment(lhnchar)); this.buf32[icell+0] = this.addLeafCell(lhnchar);
return 1; return 1;
} }
icell = inext; icell = inext;
continue; continue;
} }
// 1st character was tested // 1st character was tested
let isegchar = 1; isegchar = 1;
lhnchar -= 1; lhnchar -= 1;
// find 1st mismatch in rest of segment // find 1st mismatch in rest of segment
const lsegchar = vseg >>> 24; lsegchar = v & 0x7F;
if ( lsegchar !== 1 ) { if ( lsegchar !== 1 ) {
for (;;) { for (;;) {
if ( isegchar === lsegchar ) { break; } if ( isegchar === lsegchar ) { break; }
@ -298,49 +290,50 @@ const HNTrieContainer = class {
lhnchar -= 1; lhnchar -= 1;
} }
} }
boundaryBit = v & 0x80;
// all segment characters matched // all segment characters matched
if ( isegchar === lsegchar ) { if ( isegchar === lsegchar ) {
inext = this.buf32[icell+1];
// needle remainder: no // needle remainder: no
if ( lhnchar === 0 ) { if ( lhnchar === 0 ) {
// boundary cell already present // boundary: yes, already present
if ( inext === 0 || this.buf32[inext+2] === 0 ) { return 0; } if ( boundaryBit !== 0 ) { return 0; }
// need boundary cell // boundary: no, mark as boundary
this.buf32[icell+1] = this.addCell(0, inext, 0); this.buf32[icell+2] = v | 0x80;
} }
// needle remainder: yes // needle remainder: yes
else { else {
// remainder is at label boundary? if yes, no need to add
// the rest since the shortest match is always reported
if ( boundaryBit !== 0 ) {
if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
}
inext = this.buf32[icell+1];
if ( inext !== 0 ) { if ( inext !== 0 ) {
icell = inext; icell = inext;
continue; continue;
} }
// remainder is at label boundary? if yes, no need to add // add needle remainder
// the rest since the shortest match is always reported this.buf32[icell+1] = this.addLeafCell(lhnchar);
if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
// boundary cell + needle remainder
inext = this.addCell(0, 0, 0);
this.buf32[icell+1] = inext;
this.buf32[inext+1] = this.addCell(0, 0, this.addSegment(lhnchar));
} }
} }
// some segment characters matched // some segment characters matched
else { else {
// split current cell // split current cell
isegchar0 -= char0; isegchar0 -= char0;
this.buf32[icell+2] = isegchar << 24 | isegchar0; this.buf32[icell+2] = isegchar0 << 8 | isegchar;
inext = this.addCell( inext = this.addCell(
0, 0,
this.buf32[icell+1], this.buf32[icell+1],
lsegchar - isegchar << 24 | isegchar0 + isegchar isegchar0 + isegchar << 8 | boundaryBit | lsegchar - isegchar
); );
this.buf32[icell+1] = inext; this.buf32[icell+1] = inext;
// needle remainder: no = need boundary cell // needle remainder: yes, need new cell for remaining characters
if ( lhnchar === 0 ) { if ( lhnchar !== 0 ) {
this.buf32[icell+1] = this.addCell(0, inext, 0); this.buf32[inext+0] = this.addLeafCell(lhnchar);
} }
// needle remainder: yes = need new cell for remaining characters // needle remainder: no, need boundary cell
else { else {
this.buf32[inext+0] = this.addCell(0, 0, this.addSegment(lhnchar)); this.buf32[icell+2] |= 0x80;
} }
} }
return 1; return 1;
@ -459,9 +452,9 @@ const HNTrieContainer = class {
async enableWASM(wasmModuleFetcher, path) { async enableWASM(wasmModuleFetcher, path) {
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/globalThis // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/globalThis
const globals = (( ) => { const globals = (( ) => {
if ( typeof self !== 'undefined' ) { return self; }
// jshint ignore:start // jshint ignore:start
if ( typeof globalThis !== 'undefined' ) { return globalThis; } if ( typeof globalThis !== 'undefined' ) { return globalThis; }
if ( typeof self !== 'undefined' ) { return self; }
if ( typeof global !== 'undefined' ) { return global; } if ( typeof global !== 'undefined' ) { return global; }
// jshint ignore:end // jshint ignore:end
return {}; return {};
@ -508,16 +501,33 @@ const HNTrieContainer = class {
return icell; return icell;
} }
addSegment(lsegchar) { addLeafCell(lsegchar) {
const r = this.buf32[TRIE1_SLOT] >>> 2;
let i = r;
while ( lsegchar > 127 ) {
this.buf32[i+0] = 0;
this.buf32[i+1] = i + 3;
this.buf32[i+2] = this.addSegment(lsegchar, lsegchar - 127);
lsegchar -= 127;
i += 3;
}
this.buf32[i+0] = 0;
this.buf32[i+1] = 0;
this.buf32[i+2] = this.addSegment(lsegchar, 0) | 0x80;
this.buf32[TRIE1_SLOT] = i + 3 << 2;
return r;
}
addSegment(lsegchar, lsegend) {
if ( lsegchar === 0 ) { return 0; } if ( lsegchar === 0 ) { return 0; }
let char1 = this.buf32[CHAR1_SLOT]; let char1 = this.buf32[CHAR1_SLOT];
const isegchar = char1 - this.buf32[CHAR0_SLOT]; const isegchar = char1 - this.buf32[CHAR0_SLOT];
let i = lsegchar; let i = lsegchar;
do { do {
this.buf[char1++] = this.buf[--i]; this.buf[char1++] = this.buf[--i];
} while ( i !== 0 ); } while ( i !== lsegend );
this.buf32[CHAR1_SLOT] = char1; this.buf32[CHAR1_SLOT] = char1;
return (lsegchar << 24) | isegchar; return isegchar << 8 | lsegchar - lsegend;
} }
growBuf(trieGrow, charGrow) { growBuf(trieGrow, charGrow) {
@ -724,8 +734,8 @@ HNTrieContainer.prototype.HNTrieRef = class {
this.forks.push(idown, this.charPtr); this.forks.push(idown, this.charPtr);
} }
const v = this.container.buf32[this.icell+2]; const v = this.container.buf32[this.icell+2];
let i0 = this.container.buf32[CHAR0_SLOT] + (v & 0x00FFFFFF); let i0 = this.container.buf32[CHAR0_SLOT] + (v >>> 8);
const i1 = i0 + (v >>> 24); const i1 = i0 + (v & 0x7F);
while ( i0 < i1 ) { while ( i0 < i1 ) {
this.charPtr -= 1; this.charPtr -= 1;
this.charBuf[this.charPtr] = this.container.buf[i0]; this.charBuf[this.charPtr] = this.container.buf[i0];
@ -795,4 +805,4 @@ const getWasmModule = (( ) => {
/******************************************************************************/ /******************************************************************************/
export { HNTrieContainer }; export default HNTrieContainer;

View File

@ -26,9 +26,9 @@
/******************************************************************************/ /******************************************************************************/
import globals from './globals.js'; import globals from './globals.js';
import HNTrieContainer from './hntrie.js';
import { sparseBase64 } from './base64-custom.js'; import { sparseBase64 } from './base64-custom.js';
import { BidiTrieContainer } from './biditrie.js'; import { BidiTrieContainer } from './biditrie.js';
import { HNTrieContainer } from './hntrie.js';
import { StaticFilteringParser } from './static-filtering-parser.js'; import { StaticFilteringParser } from './static-filtering-parser.js';
import { CompiledListReader } from './static-filtering-io.js'; import { CompiledListReader } from './static-filtering-io.js';

Binary file not shown.

View File

@ -106,9 +106,9 @@
get_local $icell get_local $icell
i32.load offset=8 i32.load offset=8
tee_local $v tee_local $v
;; i0 = this.char0 + (v & 0x00FFFFFF); ;; i0 = char0 + (v >>> 8);
i32.const 0x00FFFFFF i32.const 8
i32.and i32.shr_u
get_local $char0 get_local $char0
i32.add i32.add
tee_local $i0 tee_local $i0
@ -130,10 +130,10 @@
end end
br 0 br 0
end end end end
;; let n = v >>> 24; ;; let n = v & 0x7F;
get_local $v get_local $v
i32.const 24 i32.const 0x7F
i32.shr_u i32.and
tee_local $n tee_local $n
;; if ( n > 1 ) { ;; if ( n > 1 ) {
i32.const 1 i32.const 1
@ -186,21 +186,12 @@
br_if 0 br_if 0
end end
end end
;; icell = this.buf32[icell+1]; ;; if ( (v & 0x80) !== 0 ) {
get_local $icell get_local $v
i32.load offset=4 i32.const 0x80
i32.const 2 i32.and
i32.shl
tee_local $icell
;; if ( icell === 0 ) { break; }
i32.eqz
br_if $noSegment
;; if ( this.buf32[icell+2] === 0 ) {
get_local $icell
i32.load offset=8
i32.eqz
if if
;; if ( ineedle === 0 || this.buf[ineedle-1] === 0x2E ) { ;; if ( ineedle === 0 || buf8[ineedle-1] === 0x2E /* '.' */ ) {
;; return ineedle; ;; return ineedle;
;; } ;; }
get_local $ineedle get_local $ineedle
@ -219,32 +210,17 @@
get_local $ineedle get_local $ineedle
return return
end end
;; icell = this.buf32[icell+1];
get_local $icell
i32.load offset=4
i32.const 2
i32.shl
set_local $icell
end end
br 0 ;; icell = this.buf32[icell+1];
get_local $icell
i32.load offset=4
i32.const 2
i32.shl
tee_local $icell
;; if ( icell === 0 ) { break; }
br_if 0
end end end end
;; return ineedle === 0 || this.buf[ineedle-1] === 0x2E ? ineedle : -1; ;; return -1;
get_local $ineedle
i32.eqz
if
i32.const 0
return
end
get_local $ineedle
i32.const -1
i32.add
i32.load8_u
i32.const 0x2E
i32.eq
if
get_local $ineedle
return
end
i32.const -1 i32.const -1
) )
@ -259,11 +235,12 @@
(local $icell i32) ;; index of current cell in the trie (local $icell i32) ;; index of current cell in the trie
(local $lhnchar i32) ;; number of characters left to process in hostname (local $lhnchar i32) ;; number of characters left to process in hostname
(local $char0 i32) ;; offset to start of character data section (local $char0 i32) ;; offset to start of character data section
(local $vseg i32) ;; integer value describing a segment (local $v i32) ;; integer value describing a segment
(local $isegchar0 i32) ;; offset to start of current segment's character data (local $isegchar0 i32) ;; offset to start of current segment's character data
(local $isegchar i32) (local $isegchar i32)
(local $lsegchar i32) ;; number of character in current segment (local $lsegchar i32) ;; number of character in current segment
(local $inext i32) ;; index of next cell to process (local $inext i32) ;; index of next cell to process
(local $boundaryBit i32) ;; the boundary bit state of the current cell
;; ;;
;; let lhnchar = this.buf[255]; ;; let lhnchar = this.buf[255];
i32.const 255 i32.const 255
@ -315,14 +292,11 @@
;; if ( this.buf32[icell+2] === 0 ) { ;; if ( this.buf32[icell+2] === 0 ) {
i32.eqz i32.eqz
if if
;; this.buf32[iroot+0] = this.addCell(0, 0, this.addSegment(lhnchar)); ;; this.buf32[iroot+0] = this.addLeafCell(lhnchar);
;; return 1; ;; return 1;
get_local $iroot get_local $iroot
i32.const 0
i32.const 0
get_local $lhnchar get_local $lhnchar
call $addSegment call $addLeafCell
call $addCell
i32.store i32.store
i32.const 1 i32.const 1
return return
@ -336,35 +310,11 @@
;; const v = this.buf32[icell+2]; ;; const v = this.buf32[icell+2];
get_local $icell get_local $icell
i32.load offset=8 i32.load offset=8
tee_local $vseg tee_local $v
;; if ( vseg === 0 ) { ;; let isegchar0 = char0 + (v >>> 8);
i32.eqz i32.const 8
if i32.shr_u
;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
get_local $lhnchar
i32.const -1
i32.add
i32.load8_u
i32.const 0x2E
i32.eq
if
i32.const -1
return
end
;; icell = this.buf32[icell+1];
;; continue;
get_local $icell
i32.load offset=4
i32.const 2
i32.shl
set_local $icell
br $nextSegment
end
;; let isegchar0 = char0 + (vseg & 0x00FFFFFF);
get_local $char0 get_local $char0
get_local $vseg
i32.const 0x00FFFFFF
i32.and
i32.add i32.add
tee_local $isegchar0 tee_local $isegchar0
;; if ( this.buf[isegchar0] !== this.buf[lhnchar-1] ) { ;; if ( this.buf[isegchar0] !== this.buf[lhnchar-1] ) {
@ -378,19 +328,14 @@
;; inext = this.buf32[icell+0]; ;; inext = this.buf32[icell+0];
get_local $icell get_local $icell
i32.load i32.load
i32.const 2
i32.shl
tee_local $inext tee_local $inext
;; if ( inext === 0 ) { ;; if ( inext === 0 ) {
i32.eqz i32.eqz
if if
;; this.buf32[icell+0] = this.addCell(0, 0, this.addSegment(lhnchar)); ;; this.buf32[icell+0] = this.addLeafCell(lhnchar);
get_local $icell get_local $icell
i32.const 0
i32.const 0
get_local $lhnchar get_local $lhnchar
call $addSegment call $addLeafCell
call $addCell
i32.store i32.store
;; return 1; ;; return 1;
i32.const 1 i32.const 1
@ -398,6 +343,8 @@
end end
;; icell = inext; ;; icell = inext;
get_local $inext get_local $inext
i32.const 2
i32.shl
set_local $icell set_local $icell
br $nextSegment br $nextSegment
end end
@ -409,10 +356,10 @@
i32.const -1 i32.const -1
i32.add i32.add
set_local $lhnchar set_local $lhnchar
;; const lsegchar = vseg >>> 24; ;; const lsegchar = v & 0x7F;
get_local $vseg get_local $v
i32.const 24 i32.const 0x7F
i32.shr_u i32.and
tee_local $lsegchar tee_local $lsegchar
;; if ( lsegchar !== 1 ) { ;; if ( lsegchar !== 1 ) {
i32.const 1 i32.const 1
@ -452,82 +399,66 @@
br 0 br 0
end end end end
end end
;; const boundaryBit = v & 0x80;
get_local $v
i32.const 0x80
i32.and
set_local $boundaryBit
;; if ( isegchar === lsegchar ) { ;; if ( isegchar === lsegchar ) {
get_local $isegchar get_local $isegchar
get_local $lsegchar get_local $lsegchar
i32.eq i32.eq
if if
;; inext = this.buf32[icell+1];
get_local $icell
i32.load offset=4
i32.const 2
i32.shl
set_local $inext
;; if ( lhnchar === 0 ) { ;; if ( lhnchar === 0 ) {
get_local $lhnchar get_local $lhnchar
i32.eqz i32.eqz
if if
;; if ( inext === 0 || this.buf32[inext+2] === 0 ) { return 0; } ;; if ( boundaryBit !== 0 ) { return 0; }
get_local $inext get_local $boundaryBit
i32.eqz
if if
i32.const 0 i32.const 0
return return
end end
get_local $inext ;; this.buf32[icell+2] = v | 0x80;
i32.load offset=8
i32.eqz
if
i32.const 0
return
end
;; this.buf32[icell+1] = this.addCell(0, inext, 0);
get_local $icell get_local $icell
i32.const 0 get_local $v
get_local $inext i32.const 0x80
i32.const 2 i32.or
i32.shr_u i32.store offset=8
i32.const 0
call $addCell
i32.store offset=4
else else
;; if ( boundaryBit !== 0 ) {
get_local $boundaryBit
if
;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
get_local $lhnchar
i32.const -1
i32.add
i32.load8_u
i32.const 0x2E
i32.eq
if
i32.const -1
return
end
end
;; inext = this.buf32[icell+1];
get_local $icell
i32.load offset=4
tee_local $inext
;; if ( inext !== 0 ) { ;; if ( inext !== 0 ) {
get_local $inext
if if
;; icell = inext; ;; icell = inext;
get_local $inext get_local $inext
i32.const 2
i32.shl
set_local $icell set_local $icell
;; continue;
br $nextSegment br $nextSegment
end end
;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; } ;; this.buf32[icell+1] = this.addLeafCell(lhnchar);
get_local $lhnchar
i32.const -1
i32.add
i32.load8_u
i32.const 0x2E
i32.eq
if
i32.const -1
return
end
;; inext = this.addCell(0, 0, 0);
;; this.buf32[icell+1] = inext;
get_local $icell get_local $icell
i32.const 0
i32.const 0
i32.const 0
call $addCell
tee_local $inext
i32.store offset=4
;; this.buf32[inext+1] = this.addCell(0, 0, this.addSegment(lhnchar));
get_local $inext
i32.const 2
i32.shl
i32.const 0
i32.const 0
get_local $lhnchar get_local $lhnchar
call $addSegment call $addLeafCell
call $addCell
i32.store offset=4 i32.store offset=4
end end
else else
@ -537,56 +468,54 @@
get_local $char0 get_local $char0
i32.sub i32.sub
tee_local $isegchar0 tee_local $isegchar0
;; this.buf32[icell+2] = isegchar << 24 | isegchar0; ;; this.buf32[icell+2] = isegchar0 << 8 | isegchar;
get_local $isegchar i32.const 8
i32.const 24
i32.shl i32.shl
get_local $isegchar
i32.or i32.or
i32.store offset=8 i32.store offset=8
;; inext = this.addCell( ;; inext = this.addCell(
;; 0, ;; 0,
;; this.buf32[icell+1], ;; this.buf32[icell+1],
;; lsegchar - isegchar << 24 | isegchar0 + isegchar ;; isegchar0 + isegchar << 8 | boundaryBit | lsegchar - isegchar
;; ); ;; );
;; this.buf32[icell+1] = inext;
get_local $icell get_local $icell
i32.const 0 i32.const 0
get_local $icell get_local $icell
i32.load offset=4 i32.load offset=4
get_local $lsegchar
get_local $isegchar
i32.sub
i32.const 24
i32.shl
get_local $isegchar0 get_local $isegchar0
get_local $isegchar get_local $isegchar
i32.add i32.add
i32.const 8
i32.shl
get_local $boundaryBit
i32.or
get_local $lsegchar
get_local $isegchar
i32.sub
i32.or i32.or
call $addCell call $addCell
tee_local $inext tee_local $inext
;; this.buf32[icell+1] = inext;
i32.store offset=4 i32.store offset=4
;; if ( lhnchar === 0 ) { ;; if ( lhnchar !== 0 ) {
get_local $lhnchar get_local $lhnchar
i32.eqz
if if
;; this.buf32[icell+1] = this.addCell(0, inext, 0); ;; this.buf32[inext+0] = this.addLeafCell(lhnchar);
get_local $icell
i32.const 0
get_local $inext
i32.const 0
call $addCell
i32.store offset=4
else
;; this.buf32[inext+0] = this.addCell(0, 0, this.addSegment(lhnchar));
get_local $inext get_local $inext
i32.const 2 i32.const 2
i32.shl i32.shl
i32.const 0
i32.const 0
get_local $lhnchar get_local $lhnchar
call $addSegment call $addLeafCell
call $addCell
i32.store i32.store
else
;; this.buf32[icell+2] |= 0x80;
get_local $icell
get_local $icell
i32.load offset=8
i32.const 0x80
i32.or
i32.store offset=8
end end
end end
;; return 1; ;; return 1;
@ -602,14 +531,14 @@
;; ;;
;; ;;
;; unsigned int addCell(idown, iright, vseg) ;; unsigned int addCell(idown, iright, v)
;; ;;
;; Add a new cell, return cell index. ;; Add a new cell, return cell index.
;; ;;
(func $addCell (func $addCell
(param $idown i32) (param $idown i32)
(param $iright i32) (param $iright i32)
(param $vseg i32) (param $v i32)
(result i32) ;; result: index of added cell (result i32) ;; result: index of added cell
(local $icell i32) (local $icell i32)
;; ;;
@ -632,7 +561,7 @@
i32.store offset=4 i32.store offset=4
;; this.buf32[icell+2] = v; ;; this.buf32[icell+2] = v;
get_local $icell get_local $icell
get_local $vseg get_local $v
i32.store offset=8 i32.store offset=8
;; return icell; ;; return icell;
get_local $icell get_local $icell
@ -641,13 +570,96 @@
) )
;; ;;
;; unsigned int addSegment(lsegchar) ;; unsigned int addLeafCell(lsegchar)
;;
;; Add a new cell, return cell index.
;;
;; Append one or more cells holding the first $lsegchar characters of the
;; needle buffer. While more than 127 characters remain, a cell holding a
;; 127-character chunk is emitted and linked to the cell which follows it. The
;; last cell gets the remaining characters and has bit 0x80 set in its segment
;; descriptor.
;;
;; NOTE: the JS-equivalent comments below are written in terms of cell
;; indices (32-bit units), while $r and $i in this function hold byte
;; offsets: a cell spans 3 x 4 = 12 bytes, and conversion to an index
;; (>>> 2) is done only where an index is stored or returned.
(func $addLeafCell
(param $lsegchar i32)
(result i32) ;; result: index of added cell
(local $r i32)
(local $i i32)
;; const r = this.buf32[TRIE1_SLOT] >>> 2;
;; (the value is kept as a byte offset here, the shift is applied on return)
i32.const 260
i32.load
tee_local $r
;; let i = r;
set_local $i
;; while ( lsegchar > 127 ) {
;; (the loop is left as soon as lsegchar <= 127, unsigned comparison)
block $lastSegment loop
get_local $lsegchar
i32.const 127
i32.le_u
br_if $lastSegment
;; this.buf32[i+0] = 0;
get_local $i
i32.const 0
i32.store
;; this.buf32[i+1] = i + 3;
;; (byte offset of the following cell, converted to a cell index)
get_local $i
get_local $i
i32.const 12
i32.add
i32.const 2
i32.shr_u
i32.store offset=4
;; this.buf32[i+2] = this.addSegment(lsegchar, lsegchar - 127);
get_local $i
get_local $lsegchar
get_local $lsegchar
i32.const 127
i32.sub
call $addSegment
i32.store offset=8
;; lsegchar -= 127;
get_local $lsegchar
i32.const 127
i32.sub
set_local $lsegchar
;; i += 3;
;; (3 cells of 4 bytes each = 12 bytes)
get_local $i
i32.const 12
i32.add
set_local $i
br 0
end end
;; this.buf32[i+0] = 0;
get_local $i
i32.const 0
i32.store
;; this.buf32[i+1] = 0;
get_local $i
i32.const 0
i32.store offset=4
;; this.buf32[i+2] = this.addSegment(lsegchar, 0) | 0x80;
;; (0x80 is set only on this last cell)
get_local $i
get_local $lsegchar
i32.const 0
call $addSegment
i32.const 0x80
i32.or
i32.store offset=8
;; this.buf32[TRIE1_SLOT] = i + 3 << 2;
;; ($i is already a byte offset, so only the 12-byte cell size is added)
i32.const 260
get_local $i
i32.const 12
i32.add
i32.store
;; return r;
;; (byte offset of the first added cell converted to a cell index)
get_local $r
i32.const 2
i32.shr_u
)
;;
;; unsigned int addSegment(lsegchar, lsegend)
;; ;;
;; Store a segment of characters and return a segment descriptor. The segment ;; Store a segment of characters and return a segment descriptor. The segment
;; is created from the character data in the needle buffer. ;; is created from the character data in the needle buffer.
;; ;;
(func $addSegment (func $addSegment
(param $lsegchar i32) (param $lsegchar i32)
(param $lsegend i32)
(result i32) ;; result: segment descriptor (result i32) ;; result: segment descriptor
(local $char1 i32) ;; offset to end of character data section (local $char1 i32) ;; offset to end of character data section
(local $isegchar i32) ;; relative offset to first character of segment (local $isegchar i32) ;; relative offset to first character of segment
@ -673,7 +685,7 @@
get_local $lsegchar get_local $lsegchar
set_local $i set_local $i
;; do { ;; do {
block $endOfSegment loop loop
;; this.buf[char1++] = this.buf[--i]; ;; this.buf[char1++] = this.buf[--i];
get_local $char1 get_local $char1
get_local $i get_local $i
@ -686,21 +698,23 @@
i32.const 1 i32.const 1
i32.add i32.add
set_local $char1 set_local $char1
;; } while ( i !== 0 ); ;; } while ( i !== lsegend );
get_local $i get_local $i
i32.eqz get_local $lsegend
br_if $endOfSegment i32.ne
br 0 br_if 0
end end end
;; this.buf32[HNBIGTRIE_CHAR1_SLOT] = char1; ;; this.buf32[HNBIGTRIE_CHAR1_SLOT] = char1;
i32.const 268 i32.const 268
get_local $char1 get_local $char1
i32.store i32.store
;; return (lsegchar << 24) | isegchar; ;; return isegchar << 8 | lsegchar - lsegend;
get_local $lsegchar
i32.const 24
i32.shl
get_local $isegchar get_local $isegchar
i32.const 8
i32.shl
get_local $lsegchar
get_local $lsegend
i32.sub
i32.or i32.or
) )