From ac74c809ebd7f0725aac4173ab586b59c612490f Mon Sep 17 00:00:00 2001 From: Cris Stringfellow <22254235+crislin2046@users.noreply.github.com> Date: Sat, 25 Dec 2021 13:47:21 +0800 Subject: [PATCH] "Fixed the windowed highlight bug, where incorrect offsets were occurring. Now using correct source array to remove that problem. Also improved highlight matching by adding a score (minimum match) to fuzzy.options (3, versus default 1)." --- archivist.js | 9 +++++++++ highlighter.js | 10 +++++----- todo | 7 +------ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/archivist.js b/archivist.js index 2e141a0..465a7e0 100644 --- a/archivist.js +++ b/archivist.js @@ -84,6 +84,13 @@ DEBUG && console.log({NDX_FTSIndex}); // fuzzy (maybe just for queries ?) + const REGULAR_SEARCH_OPTIONS_FUZZY = { + minimum_match: 1.0 + }; + + const HIGHLIGHT_OPTIONS_FUZZY = { + minimum_match: 3.0 + }; const FUZZ_OPTS = { keys: ndxDocFields({namesOnly:true}) }; @@ -864,6 +871,7 @@ export default Archivist; if ( maxLength ) { doc = Array.from(doc).slice(0, maxLength).join(''); } + Object.assign(fuzzy.options, HIGHLIGHT_OPTIONS_FUZZY); const hl = fuzzy.highlight(doc); DEBUG && console.log(query, hl); return hl; @@ -931,6 +939,7 @@ export default Archivist; url: State.Index.get('ndx'+r.key), score: r.score })); + Object.assign(fuzzy.options, REGULAR_SEARCH_OPTIONS_FUZZY); const fuzzRaw = fuzzy.search(query); const fuzz = processFuzzResults(fuzzRaw); diff --git a/highlighter.js b/highlighter.js index 2863af5..86c6c50 100644 --- a/highlighter.js +++ b/highlighter.js @@ -64,13 +64,13 @@ export function highlight(query, doc, { console.log('Zero highlights, showing first score', scores[0]); return scores.slice(0,1); } else { - let better = JSON.parse(JSON.stringify(highlights)).slice(0, 10); + let better = Array.from(highlights).slice(0, 10); better = better.map(hl => { const length = Array.from(hl.fragment.text).length; const extra = Math.round(length/2); - let {offset} = hl.fragment; - 
const newText = doc.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + doc.slice(offset + length, offset + length + extra).join(''); - //console.log({newText, oldText:hl.fragment.text}); + let {offset, symbols} = hl.fragment; + const newText = symbols.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + symbols.slice(offset + length, offset + length + extra).join(''); + DEBUG && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')}); hl.fragment.text = newText; const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length); const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist); @@ -106,7 +106,7 @@ function getFragmenter(chunkSize) { currentFrag = frags.pop(); currentFrag.text += nextSymbol; } else { - currentFrag = {text:nextSymbol, offset:index}; + currentFrag = {text:nextSymbol, offset:index, symbols}; currentLength = 0; } currentLength++; diff --git a/todo b/todo index 0225f22..dfbba04 100644 --- a/todo +++ b/todo @@ -1,10 +1,5 @@ -- highlights are mostly rubbish right now +- implement trigram index - try an exact match on the query term if possible for highlight. first one. -- don't highlight small matches like: - - search: Zuckerberg, top result: Hacker News - Top Links - - highlight Hacker News - - WTF come on... I need a threshold on this stuff....or like, if I can find a good match in - body then don't highlight worse match in title...or maybe I can use ukkonen as part of threshold - we could also add signal from the highlighting to just in time alter the order (e.g. 
'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible) - Create instant search (or at least instant queries (so search over previous queries -- not results necessarily)) - an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this