1
0
mirror of https://github.com/c9fe/22120.git synced 2024-09-19 23:21:45 +02:00
This commit is contained in:
Cris Stringfellow 2021-12-25 14:32:22 +08:00
parent a5f00b0dc0
commit 963a750878
3 changed files with 150 additions and 14 deletions

View File

@ -25,8 +25,8 @@
import { query as NDXQuery } from 'ndx-query';
import { toSerializable, fromSerializable } from 'ndx-serializable';
//import { DocumentIndex } from 'ndx';
//import Fuzzy from 'fz-search';
import * as _Fuzzy from './lib/fz.js';
import Fuzzy from 'fz-search';
//import * as _Fuzzy from './lib/fz.js';
import Nat from 'natural';
import args from './args.js';
@ -44,7 +44,8 @@
// search related state: constants and variables
// common
const Fuzzy = globalThis.FuzzySearch;
const STRIP_CHARS = /[\u0001-\u001a\0\v\f\r\t\n]/g;
//const Fuzzy = globalThis.FuzzySearch;
const NDX_OLD = false;
const USE_FLEX = true;
const FTS_INDEX_DIR = args.fts_index_dir;
@ -413,7 +414,7 @@ export default Archivist;
const flatDoc = await send("DOMSnapshot.captureSnapshot", {
computedStyles: [],
}, sessionId);
const pageText = processDoc(flatDoc).replace(/\t\n/g, ' ');
const pageText = processDoc(flatDoc).replace(STRIP_CHARS, ' ');
const {title, url} = Targets.get(sessionId);
let id, ndx_id;
@ -689,7 +690,7 @@ export default Archivist;
const path = getFuzzyPath(basePath);
Fs.writeFileSync(
path,
JSON.stringify(docs)
JSON.stringify(docs, null, 2)
);
DEBUG && console.log(`Wrote fuzzy to ${path}`);
}

21
todo
View File

@ -1,13 +1,16 @@
- get snippets earlier (before rendering in lib server) and use to add to signal
- if we have multiple query terms (multiple determined by some form of tokenization) then try to show all terms present in the snippet. even tho one term may be higher scoring. Should we do multiple passes of ukkonen distance one for whole query and one for each term? This will be easier / faster with trigrams I guess. Basically we want snippet to be a relevant summary that provides signal.
- Another way to improve snippet highlight is to 'revert back' the highlighted text, and calculate their match/ukkonen on the query term. So e.g. if we get q:'israle beverly', hl:['beverly', 'beverly'], it's good overlap, but if we get hl:['is it really'] even tho that might score ok for israle, it's not a good match. so can we 'score that back' if we go match('is it really', 'israel') and see it is low, so we exclude it?
- implement trigram index
- try an exact match on the query term if possible for highlight. first one.
- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
- an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
- this is still happening. sometimes the index is not saved, even on a normal error free restart. unknown why.
- linting
- Improve search page look
- search improvements
- use different min score options for different sources (noticed the URL did not get a 'meghan' highlight for hello mag, even though the query had 'megan' and did match and highlight 'queen' in the URL)
- get snippets earlier (before rendering in lib server) and use to add to signal
- if we have multiple query terms (multiple determined by some form of tokenization) then try to show all terms present in the snippet, even though one term may be higher scoring. Should we do multiple passes of Ukkonen distance: one for the whole query and one for each term? This will be easier / faster with trigrams I guess. Basically we want the snippet to be a relevant summary that provides signal.
- Another way to improve snippet highlight is to 'revert back' the highlighted text, and calculate its match/Ukkonen score against the query term. So e.g. if we get q:'israle beverly', hl:['beverly', 'beverly'], it's good overlap, but if we get hl:['is it really'], even though that might score ok for 'israle', it's not a good match. So can we 'score that back': if we go match('is it really', 'israel') and see it is low, do we exclude it?
- implement trigram index
- try an exact match on the query term if possible for highlight. first one.
- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
- an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
- this is still happening. sometimes the index is not saved, even on a normal error free restart. unknown why.
- We need to reload on localhost 22120 if we open with that
- We need to not open other localhosts if we already have one open
- ensure we are getting the page text to index once it is actually loaded (we should call again later, or add mutation observer and update on mutate)

132
x Normal file
View File

@ -0,0 +1,132 @@
/home/cris/Desktop/22120/app.js
92:19 error 'stdout' is assigned a value but never used no-unused-vars
92:27 error 'stderr' is assigned a value but never used no-unused-vars
/home/cris/Desktop/22120/archivist.js
29:17 error '_Fuzzy' is defined but never used no-unused-vars
34:5 error 'APP_ROOT' is defined but never used no-unused-vars
38:5 error 'SNIP_CONTEXT' is defined but never used no-unused-vars
43:11 error 'BLOCKED_BODY' is defined but never used no-unused-vars
47:25 error Unexpected control character(s) in regular expression: \x01, \x1a no-control-regex
52:25 error Unnecessary escape character: \/ no-useless-escape
69:29 error 'registerCharset' is assigned a value but never used no-unused-vars
69:46 error 'registerLanguage' is assigned a value but never used no-unused-vars
131:9 error 'IGNORE_NODES' is assigned a value but never used no-unused-vars
137:9 error 'TextNode' is assigned a value but never used no-unused-vars
138:9 error 'AttributeNode' is assigned a value but never used no-unused-vars
251:14 error 'guard' is defined but never used no-unused-vars
251:26 error 'text' is assigned a value but never used no-unused-vars
260:43 error 'context' is assigned a value but never used no-unused-vars
269:16 error Unnecessary semicolon no-extra-semi
272:16 error Unnecessary semicolon no-extra-semi
277:16 error Unnecessary semicolon no-extra-semi
281:19 error Empty block statement no-empty
289:48 error 'val' is not defined no-undef
294:28 error 'url' is assigned a value but never used no-unused-vars
303:14 error 'displayTargetInfo' is defined but never used no-unused-vars
348:62 error 'waitingForDebugger' is defined but never used no-unused-vars
390:63 error 'waitingForDebugger' is assigned a value but never used no-unused-vars
438:13 error 'res' is assigned a value but never used no-unused-vars
528:14 error 'url' is assigned a value but never used no-unused-vars
555:14 error Redundant double negation no-extra-boolean-cast
591:62 error 'message' is not defined no-undef
649:19 error 'urlFragment' is assigned a value but never used no-unused-vars
649:40 error 'headers' is assigned a value but never used no-unused-vars
649:49 error 'postData' is assigned a value but never used no-unused-vars
649:59 error 'hasPostData' is assigned a value but never used no-unused-vars
779:12 error Unnecessary semicolon no-extra-semi
797:12 error Unnecessary semicolon no-extra-semi
801:12 error Unnecessary semicolon no-extra-semi
928:22 error 'val' is defined but never used no-unused-vars
987:18 error 'id' is defined but never used no-unused-vars
1138:25 error 'DocumentIndex' is not defined no-undef
/home/cris/Desktop/22120/common.js
9:47 error 'window' is not defined no-undef
9:57 error 'window' is not defined no-undef
11:49 error 'chrome' is not defined no-undef
11:67 error 'chrome' is not defined no-undef
/home/cris/Desktop/22120/ext/bg_script.js
3:9 error 'say' is defined but never used no-unused-vars
8:8 error 'send' is assigned a value but never used no-unused-vars
8:14 error 'on' is assigned a value but never used no-unused-vars
/home/cris/Desktop/22120/ext/storage.js
9:3 error 'chrome' is not defined no-undef
20:3 error 'chrome' is not defined no-undef
21:10 error 'chrome' is not defined no-undef
22:13 error 'chrome' is not defined no-undef
/home/cris/Desktop/22120/highlighter.js
9:26 error 'chunkSize' is defined but never used no-unused-vars
87:3 error Unreachable code no-unreachable
123:10 error 'testHighlighter' is defined but never used no-unused-vars
/home/cris/Desktop/22120/index.js
1:1 error Read-only global 'require' should not be modified no-global-assign
/home/cris/Desktop/22120/lib/fuzzy.js
109:4 error Unnecessary semicolon no-extra-semi
/home/cris/Desktop/22120/lib/fz.js
159:22 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
160:34 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
167:21 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
167:53 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
199:30 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
268:31 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
359:16 error Unnecessary escape character: \- no-useless-escape
359:18 error Unnecessary escape character: \[ no-useless-escape
359:22 error Unnecessary escape character: \/ no-useless-escape
359:24 error Unnecessary escape character: \{ no-useless-escape
359:27 error Unnecessary escape character: \( no-useless-escape
359:29 error Unnecessary escape character: \) no-useless-escape
359:31 error Unnecessary escape character: \* no-useless-escape
359:33 error Unnecessary escape character: \+ no-useless-escape
359:35 error Unnecessary escape character: \? no-useless-escape
359:37 error Unnecessary escape character: \. no-useless-escape
359:41 error Unnecessary escape character: \^ no-useless-escape
359:43 error Unnecessary escape character: \$ no-useless-escape
359:45 error Unnecessary escape character: \| no-useless-escape
861:17 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
990:61 error Empty block statement no-empty
1110:83 error Empty block statement no-empty
1859:25 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
1877:25 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
2086:27 error Do not access Object.prototype method 'hasOwnProperty' from target object no-prototype-builtins
2123:38 error Unexpected control character(s) in regular expression: \x00 no-control-regex
2965:52 error 'suppress_cb' is defined but never used no-unused-vars
2965:65 error 'finally_cb' is defined but never used no-unused-vars
2971:22 error 'window' is not defined no-undef
2971:44 error 'window' is not defined no-undef
2971:70 error 'window' is not defined no-undef
3022:30 error 'a' is defined but never used no-unused-vars
3038:30 error 'a' is defined but never used no-unused-vars
3055:44 error 'define' is not defined no-undef
3058:5 error 'define' is not defined no-undef
/home/cris/Desktop/22120/libraryServer.js
1:8 error 'fs' is defined but never used no-unused-vars
6:64 error 'SNIP_CONTEXT' is defined but never used no-unused-vars
13:7 error 'INDEX_FILE' is assigned a value but never used no-unused-vars
48:10 error 'chrome_port' is assigned a value but never used no-unused-vars
53:8 error Redundant double negation no-extra-boolean-cast
/home/cris/Desktop/22120/protocol.js
6:7 error 'VERSION' is assigned a value but never used no-unused-vars
9:10 error 'promisify' is defined but never used no-unused-vars
22:12 error Redundant double negation no-extra-boolean-cast
69:5 error Move function declaration to function body root no-inner-declarations
88:5 error Move function declaration to function body root no-inner-declarations
98:22 error 'params' is assigned a value but never used no-unused-vars
140:5 error Move function declaration to function body root no-inner-declarations
148:5 error Move function declaration to function body root no-inner-declarations
156:5 error Move function declaration to function body root no-inner-declarations
160:5 error Move function declaration to function body root no-inner-declarations
161:25 error 'sessionId' is defined but never used no-unused-vars
✖ 106 problems (106 errors, 0 warnings)
10 errors and 0 warnings potentially fixable with the `--fix` option.