1
0
mirror of https://github.com/c9fe/22120.git synced 2024-11-14 23:12:30 +01:00
22120/archivist.js
2021-12-15 23:34:30 +08:00

663 lines
20 KiB
JavaScript

import hasha from 'hasha';
import {URL} from 'url';
import path from 'path';
import fs from 'fs';
import FlexSearch from 'flexsearch';
import args from './args.js';
import {
APP_ROOT, context, sleep, DEBUG,
clone,
CHECK_INTERVAL, TEXT_NODE, FORBIDDEN_TEXT_PARENT
} from './common.js';
import {connect} from './protocol.js';
import {getInjection} from './public/injection.js';
import {BLOCKED_BODY, BLOCKED_CODE, BLOCKED_HEADERS} from './blockedResponse.js';
//import xapian from 'xapian';
// cache is a simple map
// that holds the serialized requests
// that are saved on disk
let Fs, Mode, Close;
const {Index, registerCharset, registerLanguage} = FlexSearch;
const FLEX_OPTS = {
context: true,
};
const Targets = new Map();
const Flex = new Index(FLEX_OPTS);
const Cache = new Map();
const Indexing = new Set();
const State = {
Indexing,
Cache,
SavedCacheFilePath: null,
SavedIndexFilePath: null,
saver: null,
indexSaver: null,
saveInProgress: false
}
const IGNORE_NODES = new Set([
'script',
'style',
'noscript',
'datalist'
]);
const TextNode = 3;
const AttributeNode = 2;
const Archivist = {
collect, getMode, changeMode, shutdown, handlePathChanged, saveIndex
}
const BODYLESS = new Set([
301,
302,
303,
307
]);
const NEVER_CACHE = new Set([
`http://localhost:${args.server_port}`,
`http://localhost:${args.chrome_port}`
]);
const SORT_URLS = ([urlA],[urlB]) => urlA < urlB ? -1 : 1;
const CACHE_FILE = args.cache_file;
const INDEX_FILE = args.index_file;
const NO_FILE = args.no_file;
const TBL = /:\/\//g;
const HASH_OPTS = {algorithm: 'sha1'};
const UNCACHED_BODY = b64('We have not saved this data');
const UNCACHED_CODE = 404;
const UNCACHED_HEADERS = [
{ name: 'Content-type', value: 'text/plain' },
{ name: 'Content-length', value: '26' }
];
const UNCACHED = {
body:UNCACHED_BODY, responseCode:UNCACHED_CODE, responseHeaders:UNCACHED_HEADERS
}
export default Archivist;
async function collect({chrome_port:port, mode} = {}) {
if ( context == 'node' ) {
const {default:fs} = await import('fs');
Fs = fs;
}
const {library_path} = args;
const {send, on, close} = await connect({port});
const Sessions = new Map();
const Installations = new Set();
const ConfirmedInstalls = new Set();
const DELAY = 100; // 500 ?
Close = close;
Mode = mode;
let requestStage;
loadFiles();
clearSavers();
if ( Mode == 'save' ) {
requestStage = "Response";
// in case we get a updateBasePath call before an interval
// and we don't clear it in time, leading us to erroneously save the old
// cache to the new path, we always used our saved copy
State.saver = setInterval(() => saveCache(State.SavedCacheFilePath), 10000);
// we use timeout because we can trigger this ourself
// so in order to not get a race condition (overlapping calls) we ensure
// only 1 call at 1 time
State.indexSaver = setTimeout(() => saveIndex(State.SavedIndexFilePath), 10001);
} else if ( Mode == 'serve' ) {
requestStage = "Request";
} else {
throw new TypeError(`Must specify mode`);
}
//on("Target.targetInfoChanged", displayTargetInfo);
on("Target.targetInfoChanged", attachToTarget);
on("Target.targetInfoChanged", reloadIfNotLive);
on("Target.targetInfoChanged", updateTargetInfo);
on("Target.targetInfoChanged", indexURL);
on("Target.attachedToTarget", installForSession);
on("Fetch.requestPaused", cacheRequest);
on("Runtime.consoleAPICalled", handleMessage);
await send("Target.setDiscoverTargets", {discover:true});
await send("Target.setAutoAttach", {autoAttach:true, waitForDebuggerOnStart:false, flatten: true});
await send("Fetch.enable", {
patterns: [
{
urlPattern: "http*://*",
requestStage
}
],
});
const {targetInfos:targets} = await send("Target.getTargets", {});
const pageTargets = targets.filter(({type}) => type == 'page');
pageTargets.forEach(attachToTarget);
function guard(func, text = '') {
return (...args) => {
//DEBUG && console.log({text, func:func.name, args:JSON.stringify(args,null,2)});
return func(...args);
};
}
function handleMessage(args) {
const {type, args:[{value:strVal}], context} = args;
if ( type == 'info' ) {
try {
const val = JSON.parse(strVal);
// possible messages
const {install, titleChange} = val;
switch(true) {
case !!install: {
confirmInstall({install});
}; break;
case !!titleChange: {
reindexOnTitleChange({titleChange});
}; break;
default: {
if ( DEBUG ) {
console.warn(`Unknown message`, strVal);
}
}; break;
}
} catch(e) {
DEBUG && console.info('Not the message we expected to confirm install. This is OK.', {originalMessage:args});
} finally {}
}
}
function confirmInstall({install}) {
const {sessionId} = install;
if ( ! ConfirmedInstalls.has(sessionId) ) {
ConfirmedInstalls.add(sessionId);
console.log({confirmedInstall:val, context});
}
}
async function reindexOnTitleChange({titleChange}) {
const {currentTitle, url, sessionId} = titleChange;
const latestTargetInfo = await untilHas(Targets, sessionId);
latestTargetInfo.title = currentTitle;
Targets.set(sessionId, clone(latestTargetInfo));
indexURL({targetInfo:latestTargetInfo});
}
function displayTargetInfo({targetInfo}) {
if ( targetInfo.type === 'page' ) {
console.log("Target info", JSON.stringify(targetInfo, null, 2));
}
}
function updateTargetInfo({targetInfo}) {
if ( targetInfo.type === 'page' ) {
const sessionId = Sessions.get(targetInfo.targetId);
if ( sessionId ) {
const existingTargetInfo = Targets.get(sessionId);
// if we have an existing target info for this URL and have saved an updated title
if ( existingTargetInfo && existingTargetInfo.url === targetInfo.url ) {
// keep that title (because targetInfo does not reflect the latest title)
if ( existingTargetInfo.title !== existingTargetInfo.url ) {
console.log('Setting title to existing', existingTargetInfo);
targetInfo.title = existingTargetInfo.title;
}
}
Targets.set(sessionId, clone(targetInfo));
}
}
}
async function reloadIfNotLive({targetInfo}) {
if ( Mode == 'serve' ) return;
const {attached, type} = targetInfo;
if ( attached && type == 'page' ) {
const {url, targetId} = targetInfo;
const sessionId = Sessions.get(targetId);
if ( !!sessionId && !ConfirmedInstalls.has(sessionId) ) {
console.log({reloadingAsNotConfirmedInstalled:{url, sessionId}});
send("Page.stopLoading", {}, sessionId);
send("Page.reload", {}, sessionId);
}
}
}
function neverCache(url) {
return url == "about:blank" || url?.startsWith('chrome') || NEVER_CACHE.has(url);
}
async function installForSession({sessionId, targetInfo, waitingForDebugger}) {
if ( ! sessionId ) {
throw new TypeError(`installForSession needs a sessionId`);
}
const {targetId, url} = targetInfo;
if ( Mode == 'serve' ) return;
if ( dontInstall(targetInfo) ) return;
if ( targetInfo.type != 'page' ) return;
if ( Installations.has(sessionId) ) return;
console.log("installForSession running on " + targetId);
Sessions.set(targetId, sessionId);
Targets.set(sessionId, clone(targetInfo));
if ( Mode == 'save' ) {
send("Network.setCacheDisabled", {cacheDisabled:true}, sessionId);
send("Network.setBypassServiceWorker", {bypass:true}, sessionId);
await send("Runtime.enable", {}, sessionId);
await send("Page.enable", {}, sessionId);
await send("DOMSnapshot.enable", {}, sessionId);
await send("Page.addScriptToEvaluateOnNewDocument", {
source: getInjection({sessionId}),
worldName: "Context-22120-Indexing"
}, sessionId);
DEBUG && console.log("Just request install", targetId, url);
}
Installations.add(sessionId);
console.log('Installed sessionId', sessionId);
indexURL({targetInfo});
}
async function indexURL({targetInfo:info = {}, sessionId, waitingForDebugger} = {}) {
if ( Mode == 'serve' ) return;
if ( info.type != 'page' ) return;
if ( ! info.url || info.url == 'about:blank' ) return;
if ( info.url.startsWith('chrome') ) return;
if ( dontCache(info) ) return;
if ( State.Indexing.has(info.targetId) ) return;
State.Indexing.add(info.targetId);
if ( ! sessionId ) {
sessionId = await untilHas(Sessions, info.targetId);
}
if ( !Installations.has(sessionId) ) {
await untilHas(Installations, sessionId);
}
const {title:latestTitle} = Targets.get(sessionId);
State.Index.set(info.url, latestTitle);
send("DOMSnapshot.enable", {}, sessionId);
await sleep(500);
const flatDoc = await send("DOMSnapshot.captureSnapshot", {
computedStyles: [],
}, sessionId);
const pageText = processDoc(flatDoc);
//Flex.updateAsync(info.url, pageText).then(r => console.log('Search index update done'));
//Flex.addAsync(info.url, pageText).then(r => console.log('Search index update done'));
const res = Flex.update(info.url, pageText);
DEBUG && console.log('Flex Index Result>>>', res);
State.Indexing.delete(info.targetId);
const {title, url} = info;
console.log({title, url, indexed: true, searchable: true, indexType: 'full text and full content'});
}
async function untilHas(thing, key) {
if ( thing instanceof Map ) {
if ( thing.has(key) ) {
return thing.get(key);
} else {
let resolve;
const pr = new Promise(res => resolve = res);
const checker = setInterval(() => {
if ( thing.has(key) ) {
clearInterval(checker);
resolve(thing.get(key));
} else {
console.log(thing, "not have", key);
}
}, CHECK_INTERVAL);
return pr;
}
} else if ( thing instanceof Set ) {
if ( thing.has(key) ) {
return true;
} else {
let resolve;
const pr = new Promise(res => resolve = res);
const checker = setInterval(() => {
if ( thing.has(key) ) {
clearInterval(checker);
resolve(true);
} else {
console.log(thing, "not have", key);
}
}, CHECK_INTERVAL);
return pr;
}
} else {
throw new TypeError(`untilHas with thing of type ${thing} is not yet implemented!`);
}
}
function processDoc({documents, strings}) {
/*
Info
Implementation Notes
1. Code uses spec at:
https://chromedevtools.github.io/devtools-protocol/tot/DOMSnapshot/#type-NodeTreeSnapshot
2. Note that so far the below will NOT produce text for and therefore we will NOT
index textarea or input elements. We can access those by using the textValue and
inputValue array properties of the doc, if we want to implement that.
*/
const texts = [];
for( const doc of documents) {
const textIndices = doc.nodes.nodeType.reduce((Indices, type, index) => {
if ( type === TEXT_NODE ) {
const parentIndex = doc.nodes.parentIndex[index];
const forbiddenParent = parentIndex >= 0 &&
FORBIDDEN_TEXT_PARENT.has(strings[
doc.nodes.nodeName[
parentIndex
]
])
if ( ! forbiddenParent ) {
Indices.push(index);
}
}
return Indices;
}, []);
textIndices.forEach(index => {
const stringsIndex = doc.nodes.nodeValue[index];
if ( stringsIndex >= 0 ) {
const text = strings[stringsIndex];
texts.push(text);
}
});
}
const pageText = texts.filter(t => t.trim()).join(' ');
DEBUG && console.log('Page text>>>', pageText);
return pageText;
}
async function attachToTarget(targetInfo) {
if ( dontInstall(targetInfo) ) return;
const {url} = targetInfo;
if ( url && targetInfo.type == 'page' ) {
if ( ! targetInfo.attached ) {
const {sessionId} = await send("Target.attachToTarget", {
targetId: targetInfo.targetId,
flatten: true
});
Sessions.set(targetInfo.targetId, sessionId);
}
}
}
async function cacheRequest(pausedRequest) {
const {
requestId, request, resourceType,
responseStatusCode, responseHeaders, responseErrorReason
} = pausedRequest;
const {url} = request;
const isNavigationRequest = resourceType == "Document";
const isFont = resourceType == "Font";
if ( dontCache(request) ) {
DEBUG && console.log("Not caching", request.url);
return send("Fetch.continueRequest", {requestId});
}
const key = serializeRequest(request);
if ( Mode == 'serve' ) {
if ( State.Cache.has(key) ) {
let {body, responseCode, responseHeaders} = await getResponseData(State.Cache.get(key));
responseCode = responseCode || 200;
//DEBUG && console.log("Fulfilling", key, responseCode, responseHeaders, body.slice(0,140));
DEBUG && console.log("Fulfilling", key, responseCode, body.slice(0,140));
await send("Fetch.fulfillRequest", {
requestId, body, responseCode, responseHeaders
});
} else {
DEBUG && console.log("Sending cache stub", key);
await send("Fetch.fulfillRequest", {
requestId, ...UNCACHED
});
}
} else if ( Mode == 'save' ) {
const response = {key, responseCode: responseStatusCode, responseHeaders};
const resp = await getBody({requestId, responseStatusCode});
if ( !! resp ) {
let {body, base64Encoded} = resp;
if ( ! base64Encoded ) {
body = b64(body);
}
response.body = body;
const responsePath = await saveResponseData(key, request.url, response);
State.Cache.set(key, responsePath);
} else {
DEBUG && console.warn("get response body error", key, responseStatusCode, responseHeaders, pausedRequest.responseErrorReason);
response.body = '';
}
await sleep(DELAY);
if ( !isFont && responseErrorReason ) {
if ( isNavigationRequest ) {
await send("Fetch.fulfillRequest", {
requestId,
responseHeaders: BLOCKED_HEADERS,
responseCode: BLOCKED_CODE,
body: Buffer.from(responseErrorReason).toString("base64"),
},
);
} else {
await send("Fetch.failRequest", {
requestId,
errorReason: responseErrorReason
},
);
}
} else {
try {
await send("Fetch.continueRequest", {
requestId,
},
);
} catch(e) {
console.warn("Issue with continuing request", e, message);
}
}
}
}
async function getBody({requestId, responseStatusCode}) {
let resp;
if ( ! BODYLESS.has(responseStatusCode) ) {
resp = await send("Fetch.getResponseBody", {requestId});
} else {
resp = {body:'', base64Encoded:true};
}
return resp;
}
function dontInstall(targetInfo) {
return targetInfo.type !== 'page';
}
function dontCache(request) {
if ( ! request.url ) return true;
if ( neverCache(request.url) ) return true;
const url = new URL(request.url);
return NEVER_CACHE.has(url.origin) || (State.No && State.No.test(url.host));
}
async function getResponseData(path) {
try {
return JSON.parse(await Fs.promises.readFile(path));
} catch(e) {
console.warn(`Error with ${path}`, e);
return UNCACHED;
}
}
async function saveResponseData(key, url, response) {
const origin = (new URL(url).origin);
let originDir = State.Cache.get(origin);
if ( ! originDir ) {
originDir = path.resolve(library_path(), origin.replace(TBL, '_'));
try {
await Fs.promises.mkdir(originDir, {recursive:true});
} catch(e) {
console.warn(`Issue with origin directory ${path.dirname(responsePath)}`, e);
}
State.Cache.set(origin, originDir);
}
const fileName = `${await hasha(key, HASH_OPTS)}.json`;
const responsePath = path.resolve(originDir, fileName);
await Fs.promises.writeFile(responsePath, JSON.stringify(response,null,2));
return responsePath;
}
function serializeRequest(request) {
const {url, urlFragment, method, headers, postData, hasPostData} = request;
/**
let sortedHeaders = '';
for( const key of Object.keys(headers).sort() ) {
sortedHeaders += `${key}:${headers[key]}/`;
}
**/
return `${method}${url}`;
//return `${url}${urlFragment}:${method}:${sortedHeaders}:${postData}:${hasPostData}`;
}
}
function clearSavers() {
if ( State.saver ) {
clearInterval(State.saver);
State.saver = null;
}
if ( State.indexSaver ) {
clearTimeout(State.indexSaver);
State.indexSaver = null;
}
}
function loadFiles() {
try {
State.Cache = new Map(JSON.parse(Fs.readFileSync(CACHE_FILE())));
State.Index = new Map(JSON.parse(Fs.readFileSync(INDEX_FILE())));
State.SavedCacheFilePath = CACHE_FILE();
State.SavedIndexFilePath = INDEX_FILE();
DEBUG && console.log(`Loaded cache key file ${CACHE_FILE()}`);
DEBUG && console.log(`Loaded index file ${INDEX_FILE()}`);
} catch(e) {
DEBUG && console.warn('Error reading file', e);
State.Cache = new Map();
State.Index = new Map();
}
try {
if ( !Fs.existsSync(NO_FILE()) ) {
DEBUG && console.log(`The 'No file' (${NO_FILE()}) does not exist, ignoring...`);
State.No = null;
} else {
State.No = new RegExp(JSON.parse(Fs.readFileSync(NO_FILE))
.join('|')
.replace(/\./g, '\\.')
.replace(/\*/g, '.*')
.replace(/\?/g, '.?')
);
}
} catch(e) {
DEBUG && console.warn('Error compiling regex from No file', e);
State.No = null;
}
}
function getMode() { return Mode; }
async function changeMode(mode) {
DEBUG && console.log({modeChange:mode});
clearSavers();
saveCache();
saveIndex();
Close && Close();
Mode = mode;
await collect({chrome_port:args.chrome_port, mode});
}
function handlePathChanged() {
DEBUG && console.log({libraryPathChange:args.library_path()});
clearSavers();
// saves the old cache path
saveCache(State.SavedCacheFilePath);
saveIndex(State.SavedIndexFilePath);
// reloads from new path and updates Saved FilePaths
loadFiles();
}
function saveCache(path) {
if ( context == 'node' ) {
//DEBUG && console.log("Writing to", path || CACHE_FILE());
Fs.writeFileSync(path || CACHE_FILE(), JSON.stringify([...State.Cache.entries()],null,2));
}
}
function saveIndex(path) {
if ( State.saveInProgress ) return;
State.saveInProgress = true;
clearTimeout(State.indexSaver);
if ( context == 'node' ) {
//DEBUG && console.log("Writing to", path || INDEX_FILE());
//DEBUG && console.log([...State.Index.entries()].sort(SORT_URLS));
Fs.writeFileSync(
path || INDEX_FILE(),
JSON.stringify([...State.Index.entries()].sort(SORT_URLS),null,2)
);
}
State.indexSaver = setTimeout(saveIndex, 10001);
State.saveInProgress = false;
}
function shutdown() {
DEBUG && console.log(`Archivist shutting down...`);
saveCache();
Close && Close();
DEBUG && console.log(`Archivist shut down.`);
}
function b64(s) {
if ( context == 'node' ) {
return Buffer.from(s).toString('base64');
} else {
return btoa(s);
}
}