const { parse } = require('node-html-parser');
const fs = require('fs/promises');

// All hrefs in the scraped pages are site-relative; prefix them with this host.
const BASE_URL = 'http://www.survivorlibrary.com';

/**
 * Scans every HTML file in ./pages for <a> tags whose href ends in ".pdf",
 * deduplicates the resulting absolute URLs, and writes them out three ways:
 *   - ./pdfUrls.json : JSON array of URLs
 *   - ./pdfUrls.txt  : one URL per line
 *   - ./pdfUrls.sh   : shell script of resumable curl downloads that skips
 *                      files already present on disk
 *
 * @returns {Promise<void>}
 * @throws on any filesystem error (unreadable page, unwritable output)
 */
async function parseHtml() {
  const pages = await fs.readdir('./pages');
  const seen = new Set(); // O(1) dedupe; Array.includes in the loop was O(n^2)
  const pdfUrls = [];

  for (const page of pages) {
    const html = await fs.readFile(`./pages/${page}`, 'utf8');
    const root = parse(html);
    const potentialUrls = root.querySelectorAll('a');
    let validUrls = 0;

    for (const anchor of potentialUrls) {
      const url = anchor.getAttribute('href') ?? '';
      // URL found, but it's not a PDF
      if (!url.toLowerCase().endsWith('.pdf')) {
        continue;
      }

      const fullUrl = `${BASE_URL}${url}`;
      // Skip duplicates seen in this or any earlier page
      if (seen.has(fullUrl)) {
        continue;
      }
      seen.add(fullUrl);
      pdfUrls.push(fullUrl);
      validUrls++;
    }

    console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
  }

  await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
  await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));

  // One curl command per URL. -C - resumes partial downloads; the [ ! -f ... ]
  // guard skips files that already exist.
  const scriptOutput = pdfUrls.map((url) => {
    const filename = url.split('/').pop();
    // BUG FIX: this previously emitted the literal placeholder "$(unknown)"
    // (and never used `filename`), so the already-downloaded check was broken.
    return `[ ! -f "${filename}" ] && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - "${url}"`;
  });
  await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
}

// Top-level entry point: don't leave the promise floating — surface failures
// and exit non-zero so callers (cron, CI) can detect them.
parseHtml().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});