SurvivorLibraryScrape/parse_html_pages.js

const { parse } = require('node-html-parser');
const fs = require('fs/promises');
/**
 * This is a really hacky and ugly method, but essentially we're relying on node-html-parser
 * to parse the HTML of each saved page and grab all the PDF links.
 *
 * To be honest, this could've been done with some fancy regex or whatever, but who cares.
 */
async function parseHtml()
{
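    // For reference, each saved page under ./pages/ is assumed (based on the selectors used
    // below) to look roughly like this: a <title> carrying the category name, and anchors whose
    // hrefs are site-relative paths to the PDFs (the filename here is just a made-up example):
    //
    //   <title>LIBRARY-ACCOUNTING</title>
    //   ...
    //   <a href="/library/bookkeeping_1922.pdf">Bookkeeping (1922)</a>  <-- picked up
    //   <a href="/index.html">Home</a>                                  <-- skipped, not a .pdf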
    const pages = await fs.readdir('./pages');
    const pdfUrls = [];
    const folders = [];
    const folderLinks = {};
    const specialFolderCases = {
        AEROPLANES: 'Aeroplanes',
        ACCOUNTING: 'Accounting',
    };
    for (const page of pages)
    {
        const html = await fs.readFile(`./pages/${page}`, 'utf8');
        const root = parse(html);
        const potentialUrls = root.querySelectorAll('a');
        let folderName = root.querySelector('title').textContent;
        // Normalize the folder name
        folderName = folderName.replaceAll('/', '_')
            .replace('Library-', '')
            .replace('LIBRARY-', '')
            .trim()
            .replaceAll(/\s{2,}/g, ' ');
        // A few special cases, let's just handle these explicitly
        if (folderName in specialFolderCases) {
            folderName = specialFolderCases[folderName];
        }
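        // For example, a page titled "LIBRARY-ACCOUNTING" (a hypothetical title, but consistent
        // with the rules above) ends up as "ACCOUNTING" after stripping the prefix, and is then
        // mapped to "Accounting" by the special cases table.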
        if (!folders.includes(folderName)) {
            folders.push(folderName);
            folderLinks[folderName] = [];
        }
        let validUrls = 0;
        for (const anchor of potentialUrls)
        {
            const url = anchor.getAttribute('href') || '';
            // URL found, but it's not a PDF
            if (!url.toLowerCase().endsWith('.pdf')) {
                // console.error(`${url} is not a pdf`);
                continue;
            }
            // All URLs in the `href` attributes are relative, so we fix em up.
            const fullUrl = `http://www.survivorlibrary.com${url}`;
            const filename = url.split('/').pop();
            folderLinks[folderName].push(filename);
            // Duplicate
            if (pdfUrls.includes(fullUrl)) {
                continue;
            }
            // console.log(`Found PDF: ${fullUrl}`);
            pdfUrls.push(fullUrl);
            validUrls++;
        }
        console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
    }
    /**
     * Since the `pdfUrls.sh` script just downloads all the PDFs into one directory,
     * we create a script that hardlinks each file into the correct categories.
     *
     * There are likely some PDFs that are duplicated across 2+ categories, but that's fine imo.
     *
     * Since the files are *hardlinks*, we don't need to worry about using extra space.
     * You can safely delete the original, "unsorted" files and the sorted ones will still be intact.
     * However, if you _modify_ a file (e.g. edit the PDF), the change also shows up in the sorted copy,
     * since both names point at the same data on disk.
     *
     * Anyways, this method is kind of janky. We're basically generating a bunch of shell commands
     * (example below):
     * - Create a `Sorted` folder
     * - Create a `Sorted/<category>` folder
     * - Hardlink the unsorted `<filename.pdf>` to `Sorted/<category>/<filename.pdf>`
     *
     * If it works, it ain't stupid ¯\_(ツ)_/¯
     */
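    // The generated `folderLink.sh` ends up containing commands like these
    // (the filename is hypothetical; the shape comes straight from the templates below):
    //
    //   mkdir -p Sorted
    //   mkdir -p 'Sorted/Accounting'
    //   ln 'bookkeeping_1922.pdf' 'Sorted/Accounting/bookkeeping_1922.pdf'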
    let folderLinkCmds = ['mkdir -p Sorted'];
    for (const folder of folders)
    {
        const links = folderLinks[folder];
        folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`);
        for (const filename of links)
        {
            folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`);
        }
    }
    await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
    await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));
    await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4));
    await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n'));
    /**
     * It seems the web server for Survivor Library doesn't support
     * the `Range` HTTP header, so we can't just "continue" a partial download.
     *
     * I wouldn't be surprised if one (or more) of the PDFs end up corrupted, since the generated
     * script only checks whether a file _exists_ before skipping it, not whether it's complete.
     *
     * As a workaround, I created `validate_pdfs.sh` to at least check that the downloaded PDFs are valid.
     * Keep in mind that most of the PDFs that are invalid are also corrupted on Survivor Library's website,
     * meaning it's the _source_ that's corrupt, not the downloaded file specifically.
     */
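    // `validate_pdfs.sh` itself isn't part of this file; a minimal sketch of the idea
    // (an assumption, not necessarily what the actual script does) would be something like:
    //
    //   for f in *.pdf; do pdfinfo "$f" > /dev/null 2>&1 || echo "Possibly corrupt: $f"; done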
    const scriptOutput = pdfUrls.map(url => {
        const filename = url.split('/').pop();
        return `[ ! -f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`;
    });
    await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
}
parseHtml();
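
// Rough usage (an assumption based on the paths used above, not documented in this file):
//   1. Save each Survivor Library category page as an .html file into ./pages/
//   2. node parse_html_pages.js
//   3. Run the generated pdfUrls.sh in your download directory, then folderLink.sh to sort the files.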