1
0
mirror of https://github.com/c9fe/22120.git synced 2024-11-10 04:52:43 +01:00

"Up todo. Segmenter progress."

This commit is contained in:
Cris Stringfellow 2021-12-27 17:38:45 +08:00
parent 2bb0de4058
commit 14b7256295
3 changed files with 157 additions and 106 deletions

View File

@ -1,4 +1,4 @@
- implement trigram index run segmenter
- trigram segmenter not working as expected (e.g. "downey JR" does not come out correct on the avenger endgame search page). fix
- complete snippet generation
- improve title boosting of ranks
- save trigram index to disk

View File

@ -4,7 +4,7 @@ import {DEBUG} from './common.js';
const MAX_ACCEPT_SCORE = 0.5;
const CHUNK_SIZE = 12;
testHighlighter();
//testHighlighter();
function params(qLength, chunkSize = CHUNK_SIZE) {
const MaxDist = chunkSize;
@ -95,6 +95,8 @@ export function trilight(query, doc, {
/* 0 is no maxLength */
maxLength: maxLength = 0,
ngramSize: ngramSize = 3,
minSegmentGap: minSegmentGap = 20,
maxSegmentSize: maxSegmentSize = 140,
} = {}) {
query = Array.from(query);
doc = Array.from(doc);
@ -162,8 +164,52 @@ export function trilight(query, doc, {
return G;
}, []);
gaps.sort(({gap:a}, {gap:b}) => a-b);
// Build text segments greedily from the gaps between matched runs.
// `gaps` is sorted ascending by gap size, so the closest pairs of runs are
// joined first. A gap is only accepted while the resulting segment stays
// within maxSegmentSize characters.
const segments = [];
// Maps a run's start offset in doc (run.di) to the segment object that
// currently contains that run.
const runSegMap = {};
while(gaps.length) {
  const nextGap = gaps.shift();
  // runs[0] is treated as the left run of the gap and runs[1] as the right run.
  const leftSeg = runSegMap[nextGap.runs[0].di];
  const rightSeg = runSegMap[nextGap.runs[1].di];
  let newSegmentLength = 0;
  let assigned = false;
  if ( leftSeg ) {
    // The left run already belongs to a segment: try to extend that segment
    // rightwards so it ends where the right run ends.
    // NOTE(review): when rightSeg also exists it is not merged or removed
    // from `segments`, so two overlapping segments can remain — confirm
    // whether that is intended.
    newSegmentLength = nextGap.runs[1].di + nextGap.runs[1].length - leftSeg.start;
    if ( newSegmentLength <= maxSegmentSize ) {
      leftSeg.end = nextGap.runs[1].di + nextGap.runs[1].length;
      runSegMap[nextGap.runs[1].di] = leftSeg;
      assigned = leftSeg;
    }
  } else if ( rightSeg ) {
    // Only the right run belongs to a segment: try to extend that segment
    // leftwards so it starts where the left run starts.
    newSegmentLength = rightSeg.end - nextGap.runs[0].di;
    if ( newSegmentLength <= maxSegmentSize ) {
      rightSeg.start = nextGap.runs[0].di;
      runSegMap[nextGap.runs[0].di] = rightSeg;
      assigned = rightSeg;
    }
  } else {
    // Neither run belongs to a segment yet: propose a new segment spanning
    // left run + gap + right run.
    const newSegment = {
      start: nextGap.runs[0].di,
      end: nextGap.runs[0].di + nextGap.runs[0].length + nextGap.gap + nextGap.runs[1].length
    };
    if ( newSegment.end - newSegment.start <= maxSegmentSize ) {
      runSegMap[nextGap.runs[0].di] = newSegment;
      runSegMap[nextGap.runs[1].di] = newSegment;
      segments.push(newSegment);
      assigned = newSegment;
      newSegmentLength = newSegment.end - newSegment.start;
    }
  }
  if ( assigned ) {
    DEBUG && console.log('Assigned ', nextGap, 'to segment', assigned, 'now having length', newSegmentLength);
  } else {
    // Message is kept on a single line: the earlier two-line template literal
    // repeated part of the sentence and logged an embedded newline.
    DEBUG && console.log('Gap ', nextGap, `could not be assigned as it would have made an existing segment too long, or it was already too long itself.`
    );
  }
}
// Cut each segment's [start, end) range out of the doc character array and
// join it back into a string.
const textSegments = segments.map(({start,end}) => doc.slice(start,end).join(''));
//console.log(JSON.stringify({gaps}, null, 2));
return [];
return textSegments.slice(0,3);
}
// returns a function that creates non-overlapping fragments
@ -213,106 +259,106 @@ function getFragmenter(chunkSize, {overlap: overlap = false} = {}) {
// tests
function testHighlighter() {
const query = 'metahead search';
const doc = `
Hacker News new | past | comments | ask | show | jobs | submit login
1.
AWS appears to be down again
417 points by riknox 2 hours ago | hide | 260 comments
2.
FreeBSD Jails for Fun and Profit (topikettunen.com)
42 points by kettunen 1 hour ago | hide | discuss
3.
IMF, 10 countries simulate cyber attack on global financial system (nasdaq.com)
33 points by pueblito 1 hour ago | hide | 18 comments
4.
DNA seen through the eyes of a coder (berthub.eu)
116 points by dunefox 3 hours ago | hide | 37 comments
5.
Pure Bash lightweight web server (github.com/remileduc)
74 points by turrini 2 hours ago | hide | 46 comments
6.
Parser Combinators in Haskell (serokell.io)
18 points by aroccoli 1 hour ago | hide | 3 comments
7.
DeepMinds New AI with a Memory Outperforms Algorithms 25 Times Its Size (singularityhub.com)
233 points by darkscape 9 hours ago | hide | 88 comments
8.
Tinder just permabanned me or the problem with big tech (paulefou.com)
90 points by svalee 1 hour ago | hide | 106 comments
9.
Rocky Mountain Basic (wikipedia.org)
12 points by mattowen_uk 1 hour ago | hide | 5 comments
10.
Teller Reveals His Secrets (2012) (smithsonianmag.com)
56 points by Tomte 4 hours ago | hide | 26 comments
11.
Heroku Is Currently Down (heroku.com)
129 points by iamricks 2 hours ago | hide | 29 comments
12. Convictional (YC W19) is hiring engineers to build the future of B2B trade-Remote (ashbyhq.com)
2 hours ago | hide
13.
Scientists find preserved dinosaur embryo preparing to hatch like a bird (theguardian.com)
187 points by Petiver 9 hours ago | hide | 111 comments
14.
I did a Mixergy interview so bad they didn't even release it (robfitz.com)
15 points by robfitz 1 hour ago | hide | 7 comments
15.
Now DuckDuckGo is building its own desktop browser (zdnet.com)
132 points by waldekm 2 hours ago | hide | 64 comments
16.
English has been my pain for 15 years (2013) (antirez.com)
105 points by Tomte 1 hour ago | hide | 169 comments
17.
Polish opposition duo hacked with NSO spyware (apnews.com)
102 points by JumpCrisscross 2 hours ago | hide | 35 comments
18.
Linux Has Grown into a Viable PC Gaming Platform and the Steam Stats Prove It (hothardware.com)
119 points by rbanffy 3 hours ago | hide | 105 comments
19.
LGs new 16:18 monitor (theverge.com)
50 points by tosh 1 hour ago | hide | 25 comments
20.
Construction of radio equipment in a Japanese PoW camp (bournemouth.ac.uk)
117 points by marcodiego 9 hours ago | hide | 16 comments
21.
Everything I've seen on optimizing Postgres on ZFS (vadosware.io)
27 points by EntICOnc 4 hours ago | hide | 2 comments
22.
Microsoft Teams: 1 feature, 4 vulnerabilities (positive.security)
269 points by kerm1t 4 hours ago | hide | 196 comments
23.
Analog computers were the most powerful computers for thousands of years [video] (youtube.com)
103 points by jdkee 9 hours ago | hide | 55 comments
24.
Shipwrecks, Stolen Jewels, Skull-Blasting Are Some of This Years Best Mysteries (atlasobscura.com)
8 points by CapitalistCartr 1 hour ago | hide | 1 comment
25.
Isolating Xwayland in a VM (roscidus.com)
94 points by pmarin 9 hours ago | hide | 32 comments
26.
Show HN: Metaheads, a search engine for Facebook comments (metaheads.xyz)
4 points by jawerty 1 hour ago | hide | 15 comments
27.
Quantum theory based on real numbers can be experimentally falsified (nature.com)
159 points by SquibblesRedux 14 hours ago | hide | 93 comments
28.
Founder of Black Girls Code has been ousted as head of the nonprofit (businessinsider.com)
29 points by healsdata 1 hour ago | hide | 7 comments
29.
Waffle House Poet Laureate (2019) (atlantamagazine.com)
5 points by brudgers 1 hour ago | hide | 4 comments
30.
Earths magnetic field illuminates Biblical history (economist.com)
46 points by helsinkiandrew 8 hours ago | hide | 17 comments
More
`;
/*
console.log(JSON.stringify(highlight(
query, doc
).map(({fragment:{text,offset}}) => offset + ':' + text), null, 2));
*/
/*
function testHighlighter() {
const query = 'metahead search';
const doc = `
Hacker News new | past | comments | ask | show | jobs | submit login
1.
AWS appears to be down again
417 points by riknox 2 hours ago | hide | 260 comments
2.
FreeBSD Jails for Fun and Profit (topikettunen.com)
42 points by kettunen 1 hour ago | hide | discuss
3.
IMF, 10 countries simulate cyber attack on global financial system (nasdaq.com)
33 points by pueblito 1 hour ago | hide | 18 comments
4.
DNA seen through the eyes of a coder (berthub.eu)
116 points by dunefox 3 hours ago | hide | 37 comments
5.
Pure Bash lightweight web server (github.com/remileduc)
74 points by turrini 2 hours ago | hide | 46 comments
6.
Parser Combinators in Haskell (serokell.io)
18 points by aroccoli 1 hour ago | hide | 3 comments
7.
DeepMinds New AI with a Memory Outperforms Algorithms 25 Times Its Size (singularityhub.com)
233 points by darkscape 9 hours ago | hide | 88 comments
8.
Tinder just permabanned me or the problem with big tech (paulefou.com)
90 points by svalee 1 hour ago | hide | 106 comments
9.
Rocky Mountain Basic (wikipedia.org)
12 points by mattowen_uk 1 hour ago | hide | 5 comments
10.
Teller Reveals His Secrets (2012) (smithsonianmag.com)
56 points by Tomte 4 hours ago | hide | 26 comments
11.
Heroku Is Currently Down (heroku.com)
129 points by iamricks 2 hours ago | hide | 29 comments
12. Convictional (YC W19) is hiring engineers to build the future of B2B trade-Remote (ashbyhq.com)
2 hours ago | hide
13.
Scientists find preserved dinosaur embryo preparing to hatch like a bird (theguardian.com)
187 points by Petiver 9 hours ago | hide | 111 comments
14.
I did a Mixergy interview so bad they didn't even release it (robfitz.com)
15 points by robfitz 1 hour ago | hide | 7 comments
15.
Now DuckDuckGo is building its own desktop browser (zdnet.com)
132 points by waldekm 2 hours ago | hide | 64 comments
16.
English has been my pain for 15 years (2013) (antirez.com)
105 points by Tomte 1 hour ago | hide | 169 comments
17.
Polish opposition duo hacked with NSO spyware (apnews.com)
102 points by JumpCrisscross 2 hours ago | hide | 35 comments
18.
Linux Has Grown into a Viable PC Gaming Platform and the Steam Stats Prove It (hothardware.com)
119 points by rbanffy 3 hours ago | hide | 105 comments
19.
LGs new 16:18 monitor (theverge.com)
50 points by tosh 1 hour ago | hide | 25 comments
20.
Construction of radio equipment in a Japanese PoW camp (bournemouth.ac.uk)
117 points by marcodiego 9 hours ago | hide | 16 comments
21.
Everything I've seen on optimizing Postgres on ZFS (vadosware.io)
27 points by EntICOnc 4 hours ago | hide | 2 comments
22.
Microsoft Teams: 1 feature, 4 vulnerabilities (positive.security)
269 points by kerm1t 4 hours ago | hide | 196 comments
23.
Analog computers were the most powerful computers for thousands of years [video] (youtube.com)
103 points by jdkee 9 hours ago | hide | 55 comments
24.
Shipwrecks, Stolen Jewels, Skull-Blasting Are Some of This Years Best Mysteries (atlasobscura.com)
8 points by CapitalistCartr 1 hour ago | hide | 1 comment
25.
Isolating Xwayland in a VM (roscidus.com)
94 points by pmarin 9 hours ago | hide | 32 comments
26.
Show HN: Metaheads, a search engine for Facebook comments (metaheads.xyz)
4 points by jawerty 1 hour ago | hide | 15 comments
27.
Quantum theory based on real numbers can be experimentally falsified (nature.com)
159 points by SquibblesRedux 14 hours ago | hide | 93 comments
28.
Founder of Black Girls Code has been ousted as head of the nonprofit (businessinsider.com)
29 points by healsdata 1 hour ago | hide | 7 comments
29.
Waffle House Poet Laureate (2019) (atlantamagazine.com)
5 points by brudgers 1 hour ago | hide | 4 comments
30.
Earths magnetic field illuminates Biblical history (economist.com)
46 points by helsinkiandrew 8 hours ago | hide | 17 comments
More
`;
trilight('metahead search', doc.toLocaleLowerCase().replace(/\s+/g, ' '));
}
console.log(JSON.stringify(highlight(
query, doc
).map(({fragment:{text,offset}}) => offset + ':' + text), null, 2));
console.log(trilight('metahead search', doc.toLocaleLowerCase().replace(/\s+/g, ' ')));
}
*/

View File

@ -7,7 +7,7 @@ import {
say, sleep, APP_ROOT
} from './common.js';
import Archivist from './archivist.js';
import {highlight} from './highlighter.js';
import {trilight, highlight} from './highlighter.js';
const SITE_PATH = path.resolve(APP_ROOT, '..', 'public');
@ -63,10 +63,15 @@ function addHandlers() {
}, null, 2));
} else {
// Attach a display snippet to every search result.
results.forEach(r => {
  /*
  r.snippet = '... ' + highlight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH})
  .sort(({fragment:{offset:a}}, {fragment:{offset:b}}) => a-b)
  .map(hl => Archivist.findOffsets(query, hl.fragment.text))
  .join(' ... ');
  */
  // Text segments that trilight picks out of this result's content for the query.
  const segments = trilight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH});
  // Each segment is passed through Archivist.findOffsets together with the query.
  const pieces = segments.map(segment => Archivist.findOffsets(query, segment));
  r.snippet = '... ' + pieces.join(' ... ');
});
res.end(SearchResultView({results, query, HL}));
}