mirror of
https://github.com/c9fe/22120.git
synced 2024-11-10 04:52:43 +01:00
"Up todo. Segmenter progress."
This commit is contained in:
parent
2bb0de4058
commit
14b7256295
@ -1,4 +1,4 @@
|
||||
- implement trigram index run segmenter
|
||||
- trigram segmenter not working as expected. (e.g downey JR not getting correct on avenger endgame search page). fix
|
||||
- complete snippet generation
|
||||
- improve title boosting of ranks
|
||||
- save trigram index to disk
|
||||
|
@ -4,7 +4,7 @@ import {DEBUG} from './common.js';
|
||||
const MAX_ACCEPT_SCORE = 0.5;
|
||||
const CHUNK_SIZE = 12;
|
||||
|
||||
testHighlighter();
|
||||
//testHighlighter();
|
||||
|
||||
function params(qLength, chunkSize = CHUNK_SIZE) {
|
||||
const MaxDist = chunkSize;
|
||||
@ -95,6 +95,8 @@ export function trilight(query, doc, {
|
||||
/* 0 is no maxLength */
|
||||
maxLength: maxLength = 0,
|
||||
ngramSize: ngramSize = 3,
|
||||
minSegmentGap: minSegmentGap = 20,
|
||||
maxSegmentSize: maxSegmentSize = 140,
|
||||
} = {}) {
|
||||
query = Array.from(query);
|
||||
doc = Array.from(doc);
|
||||
@ -162,8 +164,52 @@ export function trilight(query, doc, {
|
||||
return G;
|
||||
}, []);
|
||||
gaps.sort(({gap:a}, {gap:b}) => a-b);
|
||||
const segments = [];
|
||||
const runSegMap = {};
|
||||
while(gaps.length) {
|
||||
const nextGap = gaps.shift();
|
||||
const leftSeg = runSegMap[nextGap.runs[0].di];
|
||||
const rightSeg = runSegMap[nextGap.runs[1].di];
|
||||
let newSegmentLength = 0;
|
||||
let assigned = false;
|
||||
if ( leftSeg ) {
|
||||
newSegmentLength = nextGap.runs[1].di + nextGap.runs[1].length - leftSeg.start;
|
||||
if ( newSegmentLength <= maxSegmentSize ) {
|
||||
leftSeg.end = nextGap.runs[1].di + nextGap.runs[1].length;
|
||||
runSegMap[nextGap.runs[1].di] = leftSeg;
|
||||
assigned = leftSeg;
|
||||
}
|
||||
} else if ( rightSeg ) {
|
||||
newSegmentLength = rightSeg.end - nextGap.runs[0].di;
|
||||
if ( newSegmentLength <= maxSegmentSize ) {
|
||||
rightSeg.start = nextGap.runs[0].di;
|
||||
runSegMap[nextGap.runs[0].di] = rightSeg;
|
||||
assigned = rightSeg;
|
||||
}
|
||||
} else {
|
||||
const newSegment = {
|
||||
start: nextGap.runs[0].di,
|
||||
end: nextGap.runs[0].di + nextGap.runs[0].length + nextGap.gap + nextGap.runs[1].length
|
||||
};
|
||||
if ( newSegment.end - newSegment.start <= maxSegmentSize ) {
|
||||
runSegMap[nextGap.runs[0].di] = newSegment;
|
||||
runSegMap[nextGap.runs[1].di] = newSegment;
|
||||
segments.push(newSegment);
|
||||
assigned = newSegment;
|
||||
newSegmentLength = newSegment.end - newSegment.start;
|
||||
}
|
||||
}
|
||||
if ( assigned ) {
|
||||
DEBUG && console.log('Assigned ', nextGap, 'to segment', assigned, 'now having length', newSegmentLength);
|
||||
} else {
|
||||
DEBUG && console.log('Gap ', nextGap, `could not be assigned as it would have made an existing
|
||||
as it would have made an existing segment too long, or it was already too long itself.`
|
||||
);
|
||||
}
|
||||
}
|
||||
const textSegments = segments.map(({start,end}) => doc.slice(start,end).join(''));
|
||||
//console.log(JSON.stringify({gaps}, null, 2));
|
||||
return [];
|
||||
return textSegments.slice(0,3);
|
||||
}
|
||||
|
||||
// returns a function that creates non-overlapping fragments
|
||||
@ -213,106 +259,106 @@ function getFragmenter(chunkSize, {overlap: overlap = false} = {}) {
|
||||
|
||||
|
||||
// tests
|
||||
function testHighlighter() {
|
||||
const query = 'metahead search';
|
||||
const doc = `
|
||||
Hacker News new | past | comments | ask | show | jobs | submit login
|
||||
1.
|
||||
AWS appears to be down again
|
||||
417 points by riknox 2 hours ago | hide | 260 comments
|
||||
2.
|
||||
FreeBSD Jails for Fun and Profit (topikettunen.com)
|
||||
42 points by kettunen 1 hour ago | hide | discuss
|
||||
3.
|
||||
IMF, 10 countries simulate cyber attack on global financial system (nasdaq.com)
|
||||
33 points by pueblito 1 hour ago | hide | 18 comments
|
||||
4.
|
||||
DNA seen through the eyes of a coder (berthub.eu)
|
||||
116 points by dunefox 3 hours ago | hide | 37 comments
|
||||
5.
|
||||
Pure Bash lightweight web server (github.com/remileduc)
|
||||
74 points by turrini 2 hours ago | hide | 46 comments
|
||||
6.
|
||||
Parser Combinators in Haskell (serokell.io)
|
||||
18 points by aroccoli 1 hour ago | hide | 3 comments
|
||||
7.
|
||||
DeepMind’s New AI with a Memory Outperforms Algorithms 25 Times Its Size (singularityhub.com)
|
||||
233 points by darkscape 9 hours ago | hide | 88 comments
|
||||
8.
|
||||
Tinder just permabanned me or the problem with big tech (paulefou.com)
|
||||
90 points by svalee 1 hour ago | hide | 106 comments
|
||||
9.
|
||||
Rocky Mountain Basic (wikipedia.org)
|
||||
12 points by mattowen_uk 1 hour ago | hide | 5 comments
|
||||
10.
|
||||
Teller Reveals His Secrets (2012) (smithsonianmag.com)
|
||||
56 points by Tomte 4 hours ago | hide | 26 comments
|
||||
11.
|
||||
Heroku Is Currently Down (heroku.com)
|
||||
129 points by iamricks 2 hours ago | hide | 29 comments
|
||||
12. Convictional (YC W19) is hiring engineers to build the future of B2B trade-Remote (ashbyhq.com)
|
||||
2 hours ago | hide
|
||||
13.
|
||||
Scientists find preserved dinosaur embryo preparing to hatch like a bird (theguardian.com)
|
||||
187 points by Petiver 9 hours ago | hide | 111 comments
|
||||
14.
|
||||
I did a Mixergy interview so bad they didn't even release it (robfitz.com)
|
||||
15 points by robfitz 1 hour ago | hide | 7 comments
|
||||
15.
|
||||
Now DuckDuckGo is building its own desktop browser (zdnet.com)
|
||||
132 points by waldekm 2 hours ago | hide | 64 comments
|
||||
16.
|
||||
English has been my pain for 15 years (2013) (antirez.com)
|
||||
105 points by Tomte 1 hour ago | hide | 169 comments
|
||||
17.
|
||||
Polish opposition duo hacked with NSO spyware (apnews.com)
|
||||
102 points by JumpCrisscross 2 hours ago | hide | 35 comments
|
||||
18.
|
||||
Linux Has Grown into a Viable PC Gaming Platform and the Steam Stats Prove It (hothardware.com)
|
||||
119 points by rbanffy 3 hours ago | hide | 105 comments
|
||||
19.
|
||||
LG’s new 16:18 monitor (theverge.com)
|
||||
50 points by tosh 1 hour ago | hide | 25 comments
|
||||
20.
|
||||
Construction of radio equipment in a Japanese PoW camp (bournemouth.ac.uk)
|
||||
117 points by marcodiego 9 hours ago | hide | 16 comments
|
||||
21.
|
||||
Everything I've seen on optimizing Postgres on ZFS (vadosware.io)
|
||||
27 points by EntICOnc 4 hours ago | hide | 2 comments
|
||||
22.
|
||||
Microsoft Teams: 1 feature, 4 vulnerabilities (positive.security)
|
||||
269 points by kerm1t 4 hours ago | hide | 196 comments
|
||||
23.
|
||||
Analog computers were the most powerful computers for thousands of years [video] (youtube.com)
|
||||
103 points by jdkee 9 hours ago | hide | 55 comments
|
||||
24.
|
||||
Shipwrecks, Stolen Jewels, Skull-Blasting Are Some of This Year’s Best Mysteries (atlasobscura.com)
|
||||
8 points by CapitalistCartr 1 hour ago | hide | 1 comment
|
||||
25.
|
||||
Isolating Xwayland in a VM (roscidus.com)
|
||||
94 points by pmarin 9 hours ago | hide | 32 comments
|
||||
26.
|
||||
Show HN: Metaheads, a search engine for Facebook comments (metaheads.xyz)
|
||||
4 points by jawerty 1 hour ago | hide | 15 comments
|
||||
27.
|
||||
Quantum theory based on real numbers can be experimentally falsified (nature.com)
|
||||
159 points by SquibblesRedux 14 hours ago | hide | 93 comments
|
||||
28.
|
||||
Founder of Black Girls Code has been ousted as head of the nonprofit (businessinsider.com)
|
||||
29 points by healsdata 1 hour ago | hide | 7 comments
|
||||
29.
|
||||
Waffle House Poet Laureate (2019) (atlantamagazine.com)
|
||||
5 points by brudgers 1 hour ago | hide | 4 comments
|
||||
30.
|
||||
Earth’s magnetic field illuminates Biblical history (economist.com)
|
||||
46 points by helsinkiandrew 8 hours ago | hide | 17 comments
|
||||
More
|
||||
`;
|
||||
/*
|
||||
console.log(JSON.stringify(highlight(
|
||||
query, doc
|
||||
).map(({fragment:{text,offset}}) => offset + ':' + text), null, 2));
|
||||
*/
|
||||
/*
|
||||
function testHighlighter() {
|
||||
const query = 'metahead search';
|
||||
const doc = `
|
||||
Hacker News new | past | comments | ask | show | jobs | submit login
|
||||
1.
|
||||
AWS appears to be down again
|
||||
417 points by riknox 2 hours ago | hide | 260 comments
|
||||
2.
|
||||
FreeBSD Jails for Fun and Profit (topikettunen.com)
|
||||
42 points by kettunen 1 hour ago | hide | discuss
|
||||
3.
|
||||
IMF, 10 countries simulate cyber attack on global financial system (nasdaq.com)
|
||||
33 points by pueblito 1 hour ago | hide | 18 comments
|
||||
4.
|
||||
DNA seen through the eyes of a coder (berthub.eu)
|
||||
116 points by dunefox 3 hours ago | hide | 37 comments
|
||||
5.
|
||||
Pure Bash lightweight web server (github.com/remileduc)
|
||||
74 points by turrini 2 hours ago | hide | 46 comments
|
||||
6.
|
||||
Parser Combinators in Haskell (serokell.io)
|
||||
18 points by aroccoli 1 hour ago | hide | 3 comments
|
||||
7.
|
||||
DeepMind’s New AI with a Memory Outperforms Algorithms 25 Times Its Size (singularityhub.com)
|
||||
233 points by darkscape 9 hours ago | hide | 88 comments
|
||||
8.
|
||||
Tinder just permabanned me or the problem with big tech (paulefou.com)
|
||||
90 points by svalee 1 hour ago | hide | 106 comments
|
||||
9.
|
||||
Rocky Mountain Basic (wikipedia.org)
|
||||
12 points by mattowen_uk 1 hour ago | hide | 5 comments
|
||||
10.
|
||||
Teller Reveals His Secrets (2012) (smithsonianmag.com)
|
||||
56 points by Tomte 4 hours ago | hide | 26 comments
|
||||
11.
|
||||
Heroku Is Currently Down (heroku.com)
|
||||
129 points by iamricks 2 hours ago | hide | 29 comments
|
||||
12. Convictional (YC W19) is hiring engineers to build the future of B2B trade-Remote (ashbyhq.com)
|
||||
2 hours ago | hide
|
||||
13.
|
||||
Scientists find preserved dinosaur embryo preparing to hatch like a bird (theguardian.com)
|
||||
187 points by Petiver 9 hours ago | hide | 111 comments
|
||||
14.
|
||||
I did a Mixergy interview so bad they didn't even release it (robfitz.com)
|
||||
15 points by robfitz 1 hour ago | hide | 7 comments
|
||||
15.
|
||||
Now DuckDuckGo is building its own desktop browser (zdnet.com)
|
||||
132 points by waldekm 2 hours ago | hide | 64 comments
|
||||
16.
|
||||
English has been my pain for 15 years (2013) (antirez.com)
|
||||
105 points by Tomte 1 hour ago | hide | 169 comments
|
||||
17.
|
||||
Polish opposition duo hacked with NSO spyware (apnews.com)
|
||||
102 points by JumpCrisscross 2 hours ago | hide | 35 comments
|
||||
18.
|
||||
Linux Has Grown into a Viable PC Gaming Platform and the Steam Stats Prove It (hothardware.com)
|
||||
119 points by rbanffy 3 hours ago | hide | 105 comments
|
||||
19.
|
||||
LG’s new 16:18 monitor (theverge.com)
|
||||
50 points by tosh 1 hour ago | hide | 25 comments
|
||||
20.
|
||||
Construction of radio equipment in a Japanese PoW camp (bournemouth.ac.uk)
|
||||
117 points by marcodiego 9 hours ago | hide | 16 comments
|
||||
21.
|
||||
Everything I've seen on optimizing Postgres on ZFS (vadosware.io)
|
||||
27 points by EntICOnc 4 hours ago | hide | 2 comments
|
||||
22.
|
||||
Microsoft Teams: 1 feature, 4 vulnerabilities (positive.security)
|
||||
269 points by kerm1t 4 hours ago | hide | 196 comments
|
||||
23.
|
||||
Analog computers were the most powerful computers for thousands of years [video] (youtube.com)
|
||||
103 points by jdkee 9 hours ago | hide | 55 comments
|
||||
24.
|
||||
Shipwrecks, Stolen Jewels, Skull-Blasting Are Some of This Year’s Best Mysteries (atlasobscura.com)
|
||||
8 points by CapitalistCartr 1 hour ago | hide | 1 comment
|
||||
25.
|
||||
Isolating Xwayland in a VM (roscidus.com)
|
||||
94 points by pmarin 9 hours ago | hide | 32 comments
|
||||
26.
|
||||
Show HN: Metaheads, a search engine for Facebook comments (metaheads.xyz)
|
||||
4 points by jawerty 1 hour ago | hide | 15 comments
|
||||
27.
|
||||
Quantum theory based on real numbers can be experimentally falsified (nature.com)
|
||||
159 points by SquibblesRedux 14 hours ago | hide | 93 comments
|
||||
28.
|
||||
Founder of Black Girls Code has been ousted as head of the nonprofit (businessinsider.com)
|
||||
29 points by healsdata 1 hour ago | hide | 7 comments
|
||||
29.
|
||||
Waffle House Poet Laureate (2019) (atlantamagazine.com)
|
||||
5 points by brudgers 1 hour ago | hide | 4 comments
|
||||
30.
|
||||
Earth’s magnetic field illuminates Biblical history (economist.com)
|
||||
46 points by helsinkiandrew 8 hours ago | hide | 17 comments
|
||||
More
|
||||
`;
|
||||
|
||||
trilight('metahead search', doc.toLocaleLowerCase().replace(/\s+/g, ' '));
|
||||
}
|
||||
console.log(JSON.stringify(highlight(
|
||||
query, doc
|
||||
).map(({fragment:{text,offset}}) => offset + ':' + text), null, 2));
|
||||
console.log(trilight('metahead search', doc.toLocaleLowerCase().replace(/\s+/g, ' ')));
|
||||
}
|
||||
*/
|
||||
|
@ -7,7 +7,7 @@ import {
|
||||
say, sleep, APP_ROOT
|
||||
} from './common.js';
|
||||
import Archivist from './archivist.js';
|
||||
import {highlight} from './highlighter.js';
|
||||
import {trilight, highlight} from './highlighter.js';
|
||||
|
||||
const SITE_PATH = path.resolve(APP_ROOT, '..', 'public');
|
||||
|
||||
@ -63,10 +63,15 @@ function addHandlers() {
|
||||
}, null, 2));
|
||||
} else {
|
||||
results.forEach(r => {
|
||||
/*
|
||||
r.snippet = '... ' + highlight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH})
|
||||
.sort(({fragment:{offset:a}}, {fragment:{offset:b}}) => a-b)
|
||||
.map(hl => Archivist.findOffsets(query, hl.fragment.text))
|
||||
.join(' ... ');
|
||||
*/
|
||||
r.snippet = '... ' + trilight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH})
|
||||
.map(segment => Archivist.findOffsets(query, segment))
|
||||
.join(' ... ');
|
||||
});
|
||||
res.end(SearchResultView({results, query, HL}));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user