mirror of
https://github.com/c9fe/22120.git
synced 2024-10-28 06:32:28 +01:00
"Succeeded in getting basic page text (all iframes and shadowroots OK)"
This commit is contained in:
parent
463083f327
commit
f08e9a1b5f
80
archivist.js
80
archivist.js
@ -4,7 +4,10 @@ import path from 'path';
|
||||
import fs from 'fs';
|
||||
import FlexSearch from 'flexsearch';
|
||||
import args from './args.js';
|
||||
import {APP_ROOT, context, sleep, DEBUG, CHECK_INTERVAL} from './common.js';
|
||||
import {
|
||||
APP_ROOT, context, sleep, DEBUG,
|
||||
CHECK_INTERVAL, TEXT_NODE, FORBIDDEN_TEXT_PARENT
|
||||
} from './common.js';
|
||||
import {connect} from './protocol.js';
|
||||
import {getInjection} from './public/injection.js';
|
||||
import {BLOCKED_BODY, BLOCKED_CODE, BLOCKED_HEADERS} from './blockedResponse.js';
|
||||
@ -239,36 +242,12 @@ async function collect({chrome_port:port, mode} = {}) {
|
||||
const flatDoc = await send("DOMSnapshot.captureSnapshot", {
|
||||
computedStyles: [],
|
||||
}, sessionId);
|
||||
console.log(flatDoc);
|
||||
processDoc(flatDoc);
|
||||
// we collect TextNodes, ignoring any under script, style or an attribute
|
||||
/*
|
||||
const ignoredParentIds = new Set(
|
||||
pageNodes.filter(
|
||||
({localName,nodeType}) => IGNORE_NODES.has(localName) || nodeType == AttributeNode
|
||||
).map(({nodeId}) => nodeId)
|
||||
);
|
||||
const pageText = pageNodes.filter(
|
||||
({nodeType,parentId}) => nodeType == TextNode && ! ignoredParentIds.has(parentId)
|
||||
).reduce(
|
||||
(Text, {nodeValue}) => Text + nodeValue + ' ',
|
||||
''
|
||||
);
|
||||
const pageText = processDoc(flatDoc);
|
||||
//Flex.updateAsync(info.url, pageText).then(r => console.log('Search index update done'));
|
||||
//Flex.addAsync(info.url, pageText).then(r => console.log('Search index update done'));
|
||||
const res = Flex.add(info.url, pageText);
|
||||
DEBUG && console.log('Flex Index Result>>>', res);
|
||||
|
||||
if ( false ) {
|
||||
console.log({
|
||||
page : {
|
||||
url: info.url,
|
||||
title: info.title,
|
||||
text: pageText
|
||||
}
|
||||
});
|
||||
}
|
||||
//Flex.updateAsync(info.url, pageText).then(r => console.log('Search index update done'));
|
||||
//Flex.addAsync(info.url, pageText).then(r => console.log('Search index update done'));
|
||||
const res = Flex.add(info.url, pageText);
|
||||
console.log(res);
|
||||
*/
|
||||
State.Indexing.delete(info.targetId);
|
||||
|
||||
console.log(`Indexed ${info.url} to ${info.title}`);
|
||||
@ -315,8 +294,45 @@ async function collect({chrome_port:port, mode} = {}) {
|
||||
}
|
||||
|
||||
function processDoc({documents, strings}) {
|
||||
const {nodes} = documents[0];
|
||||
console.log(nodes);
|
||||
/*
|
||||
Info
|
||||
Implementation Notes
|
||||
|
||||
1. Code uses spec at:
|
||||
https://chromedevtools.github.io/devtools-protocol/tot/DOMSnapshot/#type-NodeTreeSnapshot
|
||||
|
||||
2. Note that so far the below will NOT produce text for and therefore we will NOT
|
||||
index textarea or input elements. We can access those by using the textValue and
|
||||
inputValue array properties of the doc, if we want to implement that.
|
||||
*/
|
||||
|
||||
const texts = [];
|
||||
for( const doc of documents) {
|
||||
const textIndices = doc.nodes.nodeType.reduce((Indices, type, index) => {
|
||||
if ( type === TEXT_NODE ) {
|
||||
const parentIndex = doc.nodes.parentIndex[index];
|
||||
const forbiddenParent = parentIndex >= 0 &&
|
||||
FORBIDDEN_TEXT_PARENT.has(strings[
|
||||
doc.nodes.nodeName[
|
||||
parentIndex
|
||||
]
|
||||
])
|
||||
if ( ! forbiddenParent ) {
|
||||
Indices.push(index);
|
||||
}
|
||||
}
|
||||
return Indices;
|
||||
}, []);
|
||||
textIndices.forEach(index => {
|
||||
const stringsIndex = doc.nodes.nodeValue[index];
|
||||
const text = strings[stringsIndex];
|
||||
texts.push(text);
|
||||
});
|
||||
}
|
||||
|
||||
const pageText = texts.filter(t => t.trim()).join(' ');
|
||||
DEBUG && console.log('Page text>>>', pageText);
|
||||
return pageText;
|
||||
}
|
||||
|
||||
async function attachToTarget(targetInfo) {
|
||||
|
@ -24,6 +24,12 @@ export const context = Context;
|
||||
export const DEBUG = process.env.DEBUG_22120 || false;
|
||||
|
||||
export const CHECK_INTERVAL = 400;
|
||||
export const TEXT_NODE = 3;
|
||||
export const FORBIDDEN_TEXT_PARENT = new Set([
|
||||
'STYLE',
|
||||
'SCRIPT',
|
||||
'NOSCRIPT'
|
||||
]);
|
||||
|
||||
export const NO_SANDBOX = process.env.DEBUG_22120 || false;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user