SurvivorLibraryScrape/parse_html_pages.js
2022-02-03 15:53:39 +01:00

53 lines
1.6 KiB
JavaScript

const { parse } = require('node-html-parser');
const fs = require('fs/promises');
/** Origin that hrefs found in the saved pages are resolved against. */
const SITE_ORIGIN = 'http://www.survivorlibrary.com';

/** User-Agent header sent by every generated curl command. */
const CURL_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0';

/**
 * Quotes a string so a POSIX shell treats it as one literal word.
 *
 * Single quotes disable every expansion ($VAR, $(cmd), backticks, backslashes);
 * an embedded single quote is emitted as '\'' (close, escaped quote, reopen).
 *
 * @param {string} value - Raw text to embed in a shell command.
 * @returns {string} The single-quoted, shell-safe word.
 */
function shellQuote(value)
{
    return `'${value.replace(/'/g, "'\\''")}'`;
}

/**
 * Turns an anchor href into an absolute PDF URL.
 *
 * @param {string} href - Raw href attribute value ('' when the anchor has none).
 * @returns {string|null} The absolute, percent-encoded URL, or null when the
 *   href does not end in ".pdf" (case-insensitive), cannot be parsed, or is
 *   not an http(s) link.
 */
function resolvePdfUrl(href)
{
    if (!href.toLowerCase().endsWith('.pdf')) {
        return null;
    }
    let resolved;
    try {
        // Resolving against the origin keeps already-absolute hrefs intact and
        // also handles relative hrefs without a leading slash.
        resolved = new URL(href, SITE_ORIGIN);
    } catch {
        // Malformed href: nothing downloadable, treat it like a non-PDF link.
        return null;
    }
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
        return null;
    }
    return resolved.href;
}

/**
 * Builds one line of the download script: skip the download when a file with
 * the URL's last path segment already exists, otherwise fetch it with curl.
 *
 * @param {string} url - Absolute PDF URL.
 * @returns {string} A shell command line with every dynamic word quoted.
 */
function buildDownloadCommand(url)
{
    // curl -O names the output after the last URL segment, so test for that name.
    const filename = url.split('/').pop();
    const header = shellQuote(`User-Agent: ${CURL_USER_AGENT}`);
    return `[ ! -f ${shellQuote(filename)} ] && curl -H ${header} -fsSLO -C - ${shellQuote(url)}`;
}

/**
 * Scans every file in ./pages for links to PDFs and writes the unique absolute
 * URLs, in discovery order, to three files in the current directory:
 * pdfUrls.json (JSON array), pdfUrls.txt (one URL per line) and pdfUrls.sh
 * (one curl command per URL).
 *
 * Logs, per page, how many new PDF URLs were found out of all anchors.
 *
 * @returns {Promise<void>} Resolves once all three output files are written;
 *   rejects if a page cannot be read or an output file cannot be written.
 */
async function parseHtml()
{
    const pages = await fs.readdir('./pages');
    // A Set gives O(1) duplicate checks and still iterates in insertion order.
    const pdfUrls = new Set();
    for (const page of pages)
    {
        const html = await fs.readFile(`./pages/${page}`, 'utf8');
        const root = parse(html);
        const potentialUrls = root.querySelectorAll('a');
        let validUrls = 0;
        for (const anchor of potentialUrls)
        {
            const fullUrl = resolvePdfUrl(anchor.getAttribute('href') ?? '');
            // Not a usable PDF link, or already collected (possibly from an earlier page).
            if (fullUrl === null || pdfUrls.has(fullUrl)) {
                continue;
            }
            pdfUrls.add(fullUrl);
            validUrls++;
        }
        console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
    }
    const urlList = [...pdfUrls];
    await fs.writeFile('./pdfUrls.json', JSON.stringify(urlList));
    await fs.writeFile('./pdfUrls.txt', urlList.join('\n'));
    await fs.writeFile('./pdfUrls.sh', urlList.map(buildDownloadCommand).join('\n'));
}
// Script entry point. Report any failure (e.g. a page that cannot be read or
// an output file that cannot be written) and exit with a non-zero status
// instead of leaving the promise rejection unhandled.
parseHtml().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});