const { parse } = require('node-html-parser');
const fs = require('fs/promises');

/**
 * This is a really hacky and ugly method, but essentially we're relying on Node.js to
 * parse the HTML and grab all the PDF links.
 *
 * To be honest, this could've been done with some fancy regex or whatever, but who cares.
 */
async function parseHtml()
{
    // `./pages` holds the saved library pages (presumably one HTML file per category).
    const pages = await fs.readdir('./pages');

    // Accumulators: every unique PDF URL, the list of category folders, and a
    // folder -> [filenames] map used later to build the hardlink script.
    const pdfUrls = [];
    const folders = [];
    const folderLinks = {};

    const specialFolderCases = {
        AEROPLANES: 'Aeroplanes',
        ACCOUNTING: 'Accounting',
    };

    for (const page of pages)
    {
        const html = await fs.readFile(`./pages/${page}`, 'utf8');
        const root = parse(html);
        const potentialUrls = root.querySelectorAll('a');

        let folderName = root.querySelector('title').textContent;
        // Normalize the folder name
        folderName = folderName.replaceAll('/', '_')
            .replace('Library-', '')
            .replace('LIBRARY-', '')
            .trim()
            .replaceAll(/\s{2,}/g, ' ');

        // A few special cases, let's just handle these explicitly
        if (folderName in specialFolderCases) {
            folderName = specialFolderCases[folderName];
        }
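        // e.g. a page title like 'LIBRARY-ACCOUNTING' (assuming titles roughly follow that
        // pattern) ends up as 'ACCOUNTING', which the table above then maps to 'Accounting'.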
        if (!folders.includes(folderName)) {
            folders.push(folderName);
            folderLinks[folderName] = [];
        }

        let validUrls = 0;

        for (const anchor of potentialUrls)
        {
            const url = anchor.getAttribute('href') || '';

            // URL found, but it's not a PDF
            if (!url.toLowerCase().endsWith('.pdf')) {
                // console.error(`${url} is not a pdf`);
                continue;
            }

            // All URLs in the `href` attributes are relative, so we fix em up.
            const fullUrl = `http://www.survivorlibrary.com${url}`;
            const filename = url.split('/').pop();
            folderLinks[folderName].push(filename);

            // Duplicate
            if (pdfUrls.includes(fullUrl)) {
                continue;
            }

            // console.log(`Found PDF: ${fullUrl}`);
            pdfUrls.push(fullUrl);
            validUrls++;
        }

        console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
    }
    /**
     * Since the `pdfUrls.sh` script just downloads all the PDFs into one directory,
     * we generate a script that hardlinks each file into the correct category folder.
     *
     * There are likely some PDFs that are duplicated across 2+ categories, but that's fine imo.
     *
     * Since the files are *hardlinks*, we don't need to worry about using extra space.
     * You can safely delete the original, "unsorted" files and the sorted copies will remain.
     * However, if you _modify_ a file (e.g. edit the PDF), the change will also show up in the sorted copy.
     *
     * Anyways, this method is kind of janky. We're basically generating a bunch of shell commands:
     * - Create a `Sorted` folder
     * - Create a `Sorted/<category>` folder
     * - Hardlink the unsorted `<filename.pdf>` to `Sorted/<category>/<filename.pdf>`
     *
     * If it works, it ain't stupid ¯\_(ツ)_/¯
     */
    let folderLinkCmds = ['mkdir -p Sorted'];
    for (const folder of folders)
    {
        const links = folderLinks[folder];
        folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`);

        for (const filename of links)
        {
            folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`);
        }
    }
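    // For reference, the generated `folderLink.sh` ends up looking roughly like this
    // (the filename below is a made-up placeholder, not an actual library entry):
    //
    //   mkdir -p Sorted
    //   mkdir -p 'Sorted/Accounting'
    //   ln 'bookkeeping_1910.pdf' 'Sorted/Accounting/bookkeeping_1910.pdf'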
    await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
    await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));
    await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4));
    await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n'));
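    // Presumably the intended workflow: run `bash pdfUrls.sh` (generated below) to download
    // everything into one directory, then run `bash folderLink.sh` from that same directory
    // to hardlink the files into `Sorted/<category>/`.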
    /**
     * It seems the web server for Survivor Library doesn't support the `Range`
     * HTTP header, so we can't just "continue" a partial download.
     *
     * I wouldn't be surprised if one (or more) of the PDFs end up corrupted, as we only
     * check whether the file _exists_ before skipping it.
     *
     * As a workaround, I created `validate_pdfs.sh` to at least check that the downloaded PDFs are valid.
     * Keep in mind that most of the PDFs that are invalid are also corrupted on Survivor Library's website,
     * meaning it's the _source_ that's corrupt, not the downloaded file specifically.
     */
    const scriptOutput = pdfUrls.map(url => {
        const filename = url.split('/').pop();

        // Skip files that already exist; otherwise download with curl
        // (-f: fail on HTTP errors, -sS: quiet but still show errors, -L: follow redirects,
        //  -O: save under the remote filename, -C -: attempt to resume, -g: don't glob URLs).
        return `[ ! -f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`;
    });

    await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
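    /**
     * The actual `validate_pdfs.sh` is not part of this file. As a rough sketch (assuming
     * `qpdf` is available; the real script may use a different tool entirely), a validation
     * pass could look something like:
     *
     *   for f in *.pdf; do
     *       qpdf --check "$f" > /dev/null 2>&1 || echo "Possibly corrupt: $f"
     *   done
     */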
}

parseHtml();