SurvivorLibraryScrape/parse_html_pages.js

const { parse } = require('node-html-parser');
const fs = require('fs/promises');
/**
 * This is a really hacky and ugly method, but essentially we're relying on Node.js to
 * parse the HTML and grab all the PDF links.
 *
 * To be honest, this could've been done with some fancy regex or whatever, but who cares.
 */
async function parseHtml()
{
    const pages = await fs.readdir('./pages');
    const pdfUrls = [];
    const folders = [];
    const folderLinks = {};
    const specialFolderCases = {
        AEROPLANES: 'Aeroplanes',
        ACCOUNTING: 'Accounting',
    };
    for (const page of pages)
    {
        const html = await fs.readFile(`./pages/${page}`, 'utf8');
        const root = parse(html);
        const potentialUrls = root.querySelectorAll('a');
        let folderName = root.querySelector('title').textContent;
        // Normalize the folder name
        folderName = folderName.replaceAll('/', '_')
            .replace('Library-', '')
            .replace('LIBRARY-', '')
            .trim()
            .replaceAll(/\s{2,}/g, ' ');
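        // e.g. a <title> along the lines of "LIBRARY-ACCOUNTING" (exact titles vary per page)
        // comes out of the chain above as "ACCOUNTING", which the special cases below
        // then map to "Accounting".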
        // A few special cases, let's just handle these explicitly
        if (folderName in specialFolderCases) {
            folderName = specialFolderCases[folderName];
        }
        if (!folders.includes(folderName)) {
            folders.push(folderName);
            folderLinks[folderName] = [];
        }
        let validUrls = 0;
        for (const anchor of potentialUrls)
        {
            const url = anchor.getAttribute('href') || '';
            // URL found, but it's not a PDF
            if (!url.toLowerCase().endsWith('.pdf')) {
                // console.error(`${url} is not a pdf`);
                continue;
            }
            // All URLs in the `href` attributes are relative, so we fix em up.
            const fullUrl = `http://www.survivorlibrary.com${url}`;
            const filename = url.split('/').pop();
            folderLinks[folderName].push(filename);
            // Duplicate
            if (pdfUrls.includes(fullUrl)) {
                continue;
            }
            // console.log(`Found PDF: ${fullUrl}`);
            pdfUrls.push(fullUrl);
            validUrls++;
        }
        console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
    }
    /**
     * Since the `pdfUrls.sh` script just downloads all the PDFs into one directory,
     * we generate a script that hardlinks each file into the correct category folder.
     *
     * Some PDFs are likely duplicated across 2+ categories, but that's fine imo.
     *
     * Since the files are *hardlinks*, we don't need to worry about using extra space.
     * You can safely delete the original, "unsorted" files and the sorted copies will remain intact.
     * However, if you _modify_ a file (e.g. edit the PDF), the change will also show up in the sorted copy.
     *
     * Anyways, this method is kind of janky. We're basically generating a bunch of shell commands:
     * - Create a `Sorted` folder
     * - Create a `Sorted/<category>` folder for each category
     * - Hardlink each unsorted `<filename.pdf>` to `Sorted/<category>/<filename.pdf>`
     *
     * If it works, it ain't stupid ¯\_(ツ)_/¯
     */
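    /**
     * For reference, the generated `folderLink.sh` ends up looking roughly like this
     * (the filename below is made up for illustration; "Accounting" is one of the
     * categories handled above):
     *
     *   mkdir -p Sorted
     *   mkdir -p 'Sorted/Accounting'
     *   ln 'bookkeeping_1916.pdf' 'Sorted/Accounting/bookkeeping_1916.pdf'
     */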
    let folderLinkCmds = ['mkdir -p Sorted'];
    for (const folder of folders)
    {
        const links = folderLinks[folder];
        folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`);
        for (const filename of links)
        {
            folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`);
        }
    }
    await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
    await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));
    await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4));
    await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n'));
    /**
     * It seems the web server for Survivor Library doesn't support
     * the `Range` HTTP header, so we can't just "continue" a partial download.
     *
     * I wouldn't be surprised if one (or more) of the PDFs end up corrupted, since we only check
     * whether the file _exists_ (not whether it's complete) before skipping it.
     *
     * As a workaround, I created `validate_pdfs.sh` to at least check that the downloaded PDFs are valid.
     * Keep in mind that most of the invalid PDFs are also corrupted on Survivor Library's website,
     * meaning it's the _source_ that's corrupt, not the downloaded file specifically.
     */
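    /**
     * A minimal sketch of what such a validation script could look like (this is an
     * illustration, not necessarily the actual `validate_pdfs.sh` in this repo),
     * assuming `pdfinfo` from poppler-utils is installed:
     *
     *   for f in *.pdf; do
     *       pdfinfo "$f" > /dev/null 2>&1 || echo "Possibly corrupt: $f"
     *   done
     */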
    const scriptOutput = pdfUrls.map(url => {
        const filename = url.split('/').pop();
        return `[ ! -f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`;
    });
    await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
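    /**
     * Intended usage (inferred from the comments above): run `bash pdfUrls.sh` in the
     * download directory first, then run `bash folderLink.sh` from that same directory
     * to hardlink everything into `Sorted/<category>/`.
     */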
}
parseHtml();