1
0
mirror of https://github.com/c9fe/22120.git synced 2024-11-10 04:52:43 +01:00

"Fixed bug with URL HTML highlight truncation, and made small improvements: sorted by most recent first in index view, added a link to index from search page."

This commit is contained in:
Cris Stringfellow 2021-12-24 20:17:14 +08:00
parent e96d1bf74b
commit a3be9b92be
5 changed files with 37 additions and 16 deletions

View File

@ -10,7 +10,7 @@
import Path from 'path';
import os from 'os';
import Fs from 'fs';
import { stdin as input, stdout as output } from 'process';
import {stdin as input, stdout as output} from 'process';
import util from 'util';
import readline from 'readline';
@ -28,12 +28,12 @@
//import Fuzzy from 'fz-search';
import * as _Fuzzy from './lib/fz.js';
import Nat from 'natural';
//import match from 'autosuggest-highlight/match';
//import parse from 'autosuggest-highlight/parse';
import args from './args.js';
import {
APP_ROOT, context, sleep, DEBUG,
MAX_TITLE_LENGTH,
MAX_URL_LENGTH,
clone,
SNIP_CONTEXT,
CHECK_INTERVAL, TEXT_NODE, FORBIDDEN_TEXT_PARENT
@ -417,7 +417,7 @@ export default Archivist;
id = Id;
}
const doc = toNDXDoc({id, url, title, pageText});
State.Index.set(url, {id:doc.id, ndx_id:doc.ndx_id, title});
State.Index.set(url, {date:Date.now(),id:doc.id, ndx_id:doc.ndx_id, title});
State.Index.set(doc.id, url);
State.Index.set('ndx'+doc.ndx_id, url);
@ -860,7 +860,10 @@ export default Archivist;
return {url, title, id, content};
}
function findOffsets(query, doc, count = 0) {
function findOffsets(query, doc, maxLength = 0) {
if ( maxLength ) {
doc = Array.from(doc).slice(0, maxLength).join('');
}
const hl = fuzzy.highlight(doc);
DEBUG && console.log(query, hl);
return hl;
@ -912,8 +915,11 @@ export default Archivist;
}
/**
 * Load the saved index from disk and return its visible entries,
 * most recently indexed first.
 *
 * The index file is a JSON array of [key, value] pairs. Only pairs whose key
 * is a string and is not rejected by hiddenKey() are returned.
 *
 * @returns {Array<[string, *]>} the visible [key, value] pairs, sorted by
 *   value.date in descending order.
 * @throws if the index file cannot be read or does not contain valid JSON.
 */
function getIndex() {
  // Entries saved before the `date` field existed (or whose value is not an
  // object) have no date; treat them as 0 so the comparator never produces
  // NaN and such entries sort after every dated entry.
  const dateOf = val => val?.date ?? 0;
  const idx = JSON.parse(Fs.readFileSync(INDEX_FILE()))
    .filter(([key]) => typeof key === 'string' && !hiddenKey(key))
    .sort(([, a], [, b]) => dateOf(b) - dateOf(a));
  DEBUG && console.log(idx);
  return idx;
}
async function search(query) {
@ -936,8 +942,8 @@ export default Archivist;
const title = State.Index.get(obj.url)?.title;
return {
id: obj.id,
url: Archivist.findOffsets(query, obj.url) || obj.url,
title: Archivist.findOffsets(query, title) || title,
url: Archivist.findOffsets(query, obj.url, MAX_URL_LENGTH) || obj.url,
title: Archivist.findOffsets(query, title, MAX_TITLE_LENGTH) || title,
};
});
highlights.forEach(hl => HL.set(hl.id, hl));

View File

@ -26,6 +26,11 @@ export const SHOW_FETCH = false;
/* presumably a polling interval in milliseconds -- TODO confirm against callers */
export const CHECK_INTERVAL = 400;
/* nodeType value of a DOM text node (Node.TEXT_NODE) */
export const TEXT_NODE = 3;
/* passed as highlight()'s maxLength when building search result snippets */
export const MAX_HIGHLIGHTABLE_LENGTH = 0; /* 0 is no max length for highlight */
/* maxLength handed to Archivist.findOffsets when highlighting a result title */
export const MAX_TITLE_LENGTH = 140;
/* maxLength handed to Archivist.findOffsets when highlighting a result url */
export const MAX_URL_LENGTH = 140;
/* text nodes inside these elements that are ignored */
export const FORBIDDEN_TEXT_PARENT = new Set([
'STYLE',
'SCRIPT',

View File

@ -1,9 +1,10 @@
import ukkonen from 'ukkonen';
import {DEBUG} from './common.js';
const MAX_ACCEPT_SCORE = 0.5;
const CHUNK_SIZE = 24;
testHighlighter();
//testHighlighter();
function params(qLength, chunkSize) {
const MaxDist = CHUNK_SIZE;
@ -13,10 +14,15 @@ function params(qLength, chunkSize) {
}
export function highlight(query, doc, {
/* 0 is no maxLength */
maxLength: maxLength = 0,
maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE,
chunkSize: chunkSize = CHUNK_SIZE
} = {}) {
doc = Array.from(doc);
if ( maxLength ) {
doc = doc.slice(0, maxLength);
}
const highlights = [];
// use array from then length rather than string length to
// give accurate length for all unicode
@ -24,7 +30,7 @@ export function highlight(query, doc, {
const {MaxDist,MinScore,MaxScore} = params(qLength, chunkSize);
const fragments = doc.reduce(getFragmenter(chunkSize), []);
query.toLocaleLowerCase();
console.log(fragments);
DEBUG && console.log(fragments);
const scores = fragments.map(fragment => {
const distance = ukkonen(query, fragment.text.toLocaleLowerCase(), MaxDist);
@ -63,7 +69,7 @@ export function highlight(query, doc, {
return hl;
});
better.sort(({score:a}, {score:b}) => a-b);
console.log(JSON.stringify({better},null,2));
DEBUG && console.log(JSON.stringify({better},null,2));
return better.slice(0,3);
}

View File

@ -3,7 +3,7 @@ import path from 'path';
import express from 'express';
import args from './args.js';
import {DEBUG, say, sleep, APP_ROOT, SNIP_CONTEXT} from './common.js';
import {MAX_HIGHLIGHTABLE_LENGTH, DEBUG, say, sleep, APP_ROOT, SNIP_CONTEXT} from './common.js';
import Archivist from './archivist.js';
import {highlight} from './highlighter.js';
@ -65,7 +65,7 @@ function addHandlers() {
} else {
results.forEach(r => {
r.snippet = Archivist.findOffsets(query,
highlight(query, r.content).map(hl => hl.fragment.text).join('…')
highlight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH}).map(hl => hl.fragment.text).join(' ... ')
);
});
res.end(SearchResultView({results, query, HL}));
@ -227,9 +227,13 @@ function SearchResultView({results, query, HL}) {
h2 {
margin-top: 0;
}
small.url {
word-break: break-all;
}
</style>
<h1><a href=/>22120</a></h1>
<h2>Search results</h2>
Or view <a href=/archive_index.html>your index</a>.
<form method=GET action=/search>
<fieldset>
<legend>Search again</legend>
@ -246,7 +250,7 @@ function SearchResultView({results, query, HL}) {
<li>
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
<br>
<small>${(HL.get(id)?.url||url).slice(0,128)}</small>
<small class=url>${(HL.get(id)?.url||url)}</small>
<p>${snippet}</p>
</li>
`).join('\n')

2
todo
View File

@ -1,4 +1,3 @@
- fix 0 score for unrelated fragments, or negative score for short guys.
- don't highlight small matches like:
- search: Zuckerberg, top result: Hacker News - Top Links
- highlight Ha<strong>cker</strong> News
@ -7,6 +6,7 @@
- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
- an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
- this is still happening. sometimes the index is not saved, even on a normal error free restart. unknown why.
- Improve search page look
- We need to reload on localhost 22120 if we open with that
- We need to not open other localhosts if we already have one open