53 lines
1.6 KiB
JavaScript
53 lines
1.6 KiB
JavaScript
const { parse } = require('node-html-parser');
|
|
const fs = require('fs/promises');
|
|
|
|
/**
 * Turns the raw `href` of an anchor into an absolute PDF URL.
 *
 * The href is resolved against the site root, so site-relative, document-relative
 * and already-absolute links all produce a well-formed URL (the URL parser also
 * percent-encodes characters such as spaces).
 *
 * @param {string} href - Raw `href` attribute value (may be empty).
 * @returns {string|null} The absolute URL, or `null` when the href does not end
 *   in `.pdf`, cannot be parsed, or is not an http(s) link.
 */
function resolvePdfUrl(href) {
  const candidate = href.trim();

  // URL found, but it's not a PDF
  if (!candidate.toLowerCase().endsWith('.pdf')) {
    return null;
  }

  let resolved;
  try {
    resolved = new URL(candidate, 'http://www.survivorlibrary.com');
  } catch {
    // Malformed href: nothing downloadable here.
    return null;
  }

  // Links such as `mailto:` or `javascript:` cannot be fetched by the download script.
  if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
    return null;
  }

  return resolved.href;
}

/**
 * Quotes a value as a single POSIX shell word.
 *
 * Single quotes disable every shell expansion; an embedded single quote is
 * written as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} value - Text to embed in the generated script.
 * @returns {string} The single-quoted shell word.
 */
function shellQuote(value) {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/**
 * Scans every file in `./pages` for links to PDFs and writes the de-duplicated
 * list of absolute URLs to the current directory in three forms:
 * `pdfUrls.json` (JSON array), `pdfUrls.txt` (one URL per line) and
 * `pdfUrls.sh` (one `curl` command per URL, skipped when the target file
 * already exists).
 *
 * A per-page summary is logged to the console.
 *
 * @returns {Promise<void>} Resolves once all three output files are written;
 *   rejects if a page cannot be read or an output file cannot be written.
 */
async function parseHtml() {
  const entries = await fs.readdir('./pages', { withFileTypes: true });

  // A Set gives constant-time duplicate checks and still remembers insertion order.
  const seenUrls = new Set();

  for (const entry of entries) {
    // Reading a directory as a file would throw EISDIR.
    if (entry.isDirectory()) {
      continue;
    }

    const page = entry.name;
    const html = await fs.readFile(`./pages/${page}`, 'utf8');
    const potentialUrls = parse(html).querySelectorAll('a');

    let validUrls = 0;

    for (const anchor of potentialUrls) {
      const pdfUrl = resolvePdfUrl(anchor.getAttribute('href') || '');

      // Not a usable PDF link, or a duplicate of one already collected
      if (pdfUrl === null || seenUrls.has(pdfUrl)) {
        continue;
      }

      seenUrls.add(pdfUrl);
      validUrls++;
    }

    console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
  }

  const pdfUrls = [...seenUrls];

  await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
  await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));

  // The links come from scraped HTML, so both the file name and the URL are
  // shell-quoted before being placed into the generated script.
  const scriptOutput = pdfUrls.map((url) => {
    const filename = url.split('/').pop();
    return `[ ! -f ${shellQuote(filename)} ] && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - ${shellQuote(url)}`;
  });

  await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
}
|
|
|
|
// Run the scraper. A failure is reported and turned into a non-zero exit code
// rather than being left as an unhandled promise rejection.
parseHtml().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});