1
0
mirror of https://github.com/c9fe/22120.git synced 2024-11-10 04:52:43 +01:00

"At this point. MARK. The search snippets result highlighting and stability of 22120 look pretty good. Still much to improve but this is a good mark place."

This commit is contained in:
Cris Stringfellow 2021-12-25 16:19:04 +08:00
parent 963a750878
commit c1e2539abf
10 changed files with 50 additions and 110 deletions

View File

@ -90,7 +90,7 @@
};
const HIGHLIGHT_OPTIONS_FUZZY = {
minimum_match: 3.0
minimum_match: 2.0
};
const FUZZ_OPTS = {
keys: ndxDocFields({namesOnly:true})
@ -721,6 +721,7 @@ export default Archivist;
try {
State.Cache = new Map(JSON.parse(Fs.readFileSync(cacheFile)));
} catch(e) {
console.warn(e+'');
State.Cache = new Map();
someError = true;
}
@ -728,6 +729,7 @@ export default Archivist;
try {
State.Index = new Map(JSON.parse(Fs.readFileSync(indexFile)));
} catch(e) {
console.warn(e+'');
State.Index = new Map();
someError = true;
}
@ -742,18 +744,21 @@ export default Archivist;
});
DEBUG && console.log('Flex loaded');
} catch(e) {
console.warn(e+'');
someError = true;
}
try {
loadNDXIndex(NDX_FTSIndex);
} catch(e) {
console.warn(e+'');
someError = true;
}
try {
loadFuzzy();
} catch(e) {
console.warn(e+'');
someError = true;
}
@ -807,7 +812,7 @@ export default Archivist;
}
Id = Math.round(State.Index.size / 2) + 3;
NDXId = State.Index.has(NDX_ID_KEY) ? State.Index.get(NDX_ID_KEY) + 3000 : (Id + 1000000);
NDXId = State.Index.has(NDX_ID_KEY) ? State.Index.get(NDX_ID_KEY) + 1003000 : (Id + 1000000);
if ( !Number.isInteger(NDXId) ) NDXId = Id;
DEBUG && console.log({firstFreeId: Id, firstFreeNDXId: NDXId});
@ -840,6 +845,7 @@ export default Archivist;
function saveFiles({useState: useState = false, forceSave:forceSave = false} = {}) {
clearSavers();
if ( State.Index.size === 0 ) return;
State.Index.set(NDX_ID_KEY, NDXId);
if ( useState ) {
// saves the old cache path

View File

@ -27,6 +27,7 @@ export function highlight(query, doc, {
doc = doc.slice(0, maxLength);
}
const highlights = [];
const extra = chunkSize;
// use Array.from(query).length rather than query.length to
// get an accurate symbol count for all unicode
const qLength = Array.from(query).length;
@ -67,7 +68,6 @@ export function highlight(query, doc, {
let better = Array.from(highlights).slice(0, 10);
better = better.map(hl => {
const length = Array.from(hl.fragment.text).length;
const extra = Math.round(length/2);
let {offset, symbols} = hl.fragment;
const newText = symbols.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + symbols.slice(offset + length, offset + length + extra).join('');
DEBUG && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')});

View File

@ -64,7 +64,7 @@ function addHandlers() {
}, null, 2));
} else {
results.forEach(r => {
r.snippet = highlight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH})
r.snippet = '... ' + highlight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH})
.sort(({fragment:{offset:a}}, {fragment:{offset:b}}) => a-b)
.map(hl => Archivist.findOffsets(query, hl.fragment.text))
.join(' ... ');
@ -134,40 +134,7 @@ function IndexView(urls) {
<!DOCTYPE html>
<meta charset=utf-8>
<title>Your HTML Library</title>
<style>
:root {
font-family: sans-serif;
background: lavenderblush;
}
body {
display: table;
margin: 0 auto;
background: silver;
padding: 0.5em;
box-shadow: 0 1px 1px purple;
}
form {
}
fieldset {
border: thin solid purple;
}
button, input, output {
}
input.long {
width: 100%;
min-width: 250px;
}
output {
font-size: smaller;
color: purple;
}
h1 {
margin: 0;
}
h2 {
margin-top: 0;
}
</style>
<link rel=stylesheet href=/style.css>
<h1><a href=/>22120</a></h1>
<h2>Internet Offline Library</h2>
<h2>Archive Index</h2>
@ -195,43 +162,7 @@ function SearchResultView({results, query, HL}) {
<!DOCTYPE html>
<meta charset=utf-8>
<title>${query} - 22120 search results</title>
<style>
:root {
font-family: sans-serif;
background: lavenderblush;
}
body {
display: table;
margin: 0 auto;
background: silver;
padding: 0.5em;
box-shadow: 0 1px 1px purple;
}
form {
}
fieldset {
border: thin solid purple;
}
button, input, output {
}
input.long {
width: 100%;
min-width: 250px;
}
output {
font-size: smaller;
color: purple;
}
h1 {
margin: 0;
}
h2 {
margin-top: 0;
}
small.url {
word-break: break-all;
}
</style>
<link rel=stylesheet href=/style.css>
<h1><a href=/>22120</a></h1>
<h2>Search results</h2>
Or view <a href=/archive_index.html>your index</a>.

View File

@ -1,40 +1,7 @@
<!DOCTYPE html>
<meta charset=utf-8>
<title>Your HTML Library</title>
<style>
:root {
font-family: sans-serif;
background: lavenderblush;
}
body {
display: table;
margin: 0 auto;
background: silver;
padding: 0.5em;
box-shadow: 0 1px 1px purple;
}
form {
}
fieldset {
border: thin solid purple;
}
button, input, output {
}
input.long {
width: 100%;
min-width: 250px;
}
output {
font-size: smaller;
color: purple;
}
h1 {
margin: 0;
}
h2 {
margin-top: 0;
}
</style>
<link rel=stylesheet href=/style.css>
<h1><a href=/>22120</a></h1>
<h2>Internet Offline Library</h2>
<p>

36
public/style.css Normal file
View File

@ -0,0 +1,36 @@
/*
 * Shared stylesheet served as /style.css.
 * These rules were previously duplicated in inline <style> blocks in the
 * index page, the archive index view and the search results view; those
 * pages now reference this file with <link rel=stylesheet href=/style.css>.
 */

/* Document-wide font family and page background colour. */
:root {
font-family: sans-serif;
background: lavenderblush;
}
/* display: table makes the body shrink to fit its content, and
   margin: 0 auto then centres that box horizontally on the page. */
body {
display: table;
margin: 0 auto;
background: silver;
padding: 0.5em;
box-shadow: 0 1px 1px purple;
}
/* Empty rule: no declarations. */
form {
}
fieldset {
border: thin solid purple;
}
/* Empty rule: no declarations. */
button, input, output {
}
/* Inputs with class "long" fill the available width but never shrink
   below 250px. */
input.long {
width: 100%;
min-width: 250px;
}
output {
font-size: smaller;
color: purple;
}
/* Remove the default heading margins (all sides for h1, top only for h2). */
h1 {
margin: 0;
}
h2 {
margin-top: 0;
}
/* Allow line breaks between any two characters so long unbroken strings
   wrap instead of overflowing. NOTE(review): presumably applied to result
   URLs on the search page; confirm against the markup. */
small.url {
word-break: break-all;
}

2
todo
View File

@ -1,11 +1,11 @@
- linting
- Improve search page look
- search improvements
- implement trigram index
- use different min score options for different sources (noticed that for a Hello magazine result the URL did not get a highlight for 'meghan' even though the query contained 'megan', while 'queen' did match and was highlighted in the URL)
- get snippets earlier (before rendering in lib server) and use to add to signal
- if we have multiple query terms (multiple determined by some form of tokenization) then try to show all terms present in the snippet, even though one term may be higher scoring. Should we do multiple passes of Ukkonen distance: one for the whole query and one for each term? This will be easier / faster with trigrams I guess. Basically we want the snippet to be a relevant summary that provides signal.
- Another way to improve snippet highlighting is to 'revert back' the highlighted text and calculate its match/Ukkonen score against the query term. E.g. if we get q:'israle beverly', hl:['beverly', 'beverly'], it's a good overlap, but if we get hl:['is it really'], even though that might score OK for israle, it's not a good match. So can we 'score that back': if we run match('is it really', 'israel') and see that it is low, do we exclude it?
- implement trigram index
- try an exact match on the query term if possible for highlight. first one.
- we could also add signal from the highlighting to alter the order just in time (e.g. a 'hell wiki' search brings a Google search page to the top rank, but the Hell Wikipedia page has more highlights visible)
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))