const { parse } = require('node-html-parser');
const fs = require('fs/promises');

/**
 * This is a really hacky and ugly method, but essentially we're relying on Node.js to
 * parse the HTML and grab all the PDF links.
 *
 * To be honest, this could've been done with some fancy regex or whatever, but who cares.
 */
async function parseHtml() {
    const pages = await fs.readdir('./pages');

    const pdfUrls = [];
    const folders = [];
    const folderLinks = {};

    const specialFolderCases = {
        AEROPLANES: 'Aeroplanes',
        ACCOUNTING: 'Accounting',
    };

    for (const page of pages) {
        const html = await fs.readFile(`./pages/${page}`, 'utf8');
        const root = parse(html);
        const potentialUrls = root.querySelectorAll('a');

        let folderName = root.querySelector('title').textContent;

        // Normalize the folder name
        folderName = folderName.replaceAll('/', '_')
            .replace('Library-', '')
            .replace('LIBRARY-', '')
            .trim()
            .replaceAll(/\s{2,}/g, ' ');

        // A few special cases, let's just handle these explicitly
        if (folderName in specialFolderCases) {
            folderName = specialFolderCases[folderName];
        }

        if (!folders.includes(folderName)) {
            folders.push(folderName);
            folderLinks[folderName] = [];
        }

        let validUrls = 0;

        for (const anchor of potentialUrls) {
            const url = anchor.getAttribute('href') || '';

            // URL found, but it's not a PDF
            if (!url.toLowerCase().endsWith('.pdf')) {
                // console.error(`${url} is not a pdf`);
                continue;
            }

            // All URLs in the `href` attributes are relative, so we fix em up.
            const fullUrl = `http://www.survivorlibrary.com${url}`;
            const filename = url.split('/').pop();

            folderLinks[folderName].push(filename);

            // Duplicate
            if (pdfUrls.includes(fullUrl)) {
                continue;
            }

            // console.log(`Found PDF: ${fullUrl}`);
            pdfUrls.push(fullUrl);
            validUrls++;
        }

        console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
    }

    /**
     * Since the `pdfUrls.sh` script just downloads all the PDFs into one directory,
     * we create a script that hardlinks each file into the correct categories.
     *
     * There are likely some PDFs that are duplicated across 2+ categories, but that's fine imo.
     *
     * Since the files are *hardlinks*, we don't need to worry about using extra space.
     * You can safely delete the original, "unsorted" files and the sorted copies will still be there.
     * However, if you _modify_ a file (e.g. edit the PDF), the change also shows up in the sorted copy.
     *
     * Anyways, this method is kind of janky. We're basically generating a bunch of shell commands:
     * - Create a `Sorted` folder
     * - Create a `Sorted/<folder>` folder for each category
     * - Hardlink each unsorted `<file>` to `Sorted/<folder>/<file>`
     *
     * If it works, it ain't stupid ¯\_(ツ)_/¯
     */
    let folderLinkCmds = ['mkdir -p Sorted'];

    for (const folder of folders) {
        const links = folderLinks[folder];

        folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`);

        for (const filename of links) {
            folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`);
        }
    }
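    // The generated commands (written to `folderLink.sh` below) end up looking
    // something like this — the filename here is purely illustrative:
    //
    //   mkdir -p Sorted
    //   mkdir -p 'Sorted/Accounting'
    //   ln 'some_accounting_book.pdf' 'Sorted/Accounting/some_accounting_book.pdf'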
    await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
    await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));
    await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4));
    await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n'));

    /**
     * It seems the web server for Survivor Library doesn't support the `Range` HTTP header,
     * so we can't just "continue" a partial download.
     *
     * I wouldn't be surprised if one (or more) of the PDFs end up corrupted, as we only check
     * whether the file _exists_ before skipping it.
     *
     * As a workaround, I created `validate_pdfs.sh` to at least validate that the PDFs are intact.
     * Keep in mind that most of the invalid PDFs are also corrupted on Survivor Library's website,
     * meaning it's the _source_ that's corrupt, not the downloaded file specifically.
     */
    const scriptOutput = pdfUrls.map(url => {
        const filename = url.split('/').pop();

        return `[ ! -f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`;
    });

    await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
}

parseHtml();
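// Rough usage once this script has run, all from the same directory
// (the script name below is just a placeholder):
//
//   node parse.js          # writes pdfUrls.{json,txt,sh}, folders.json and folderLink.sh
//   bash pdfUrls.sh        # download every PDF into the current directory
//   bash validate_pdfs.sh  # optionally sanity-check the downloaded PDFs
//   bash folderLink.sh     # hardlink the downloads into Sorted/<folder>/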