const { parse } = require('node-html-parser');
const fs = require('fs/promises');

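// Assumes the category pages (HTML saved from survivorlibrary.com, presumably one file
// per category) are already sitting in ./pages/.
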
/**
 * This is a really hacky and ugly method, but essentially we're relying on Node.js to
 * parse the HTML and grab all the PDF links.
 *
 * To be honest, this could've been done with some fancy regex or whatever, but who cares.
 */
async function parseHtml()
{
    const pages = await fs.readdir('./pages');
    const pdfUrls = [];
    const folders = [];
    // Maps each category (folder) name to the PDF filenames that belong to it.
    const folderLinks = {};

    for (const page of pages)
    {
        const html = await fs.readFile(`./pages/${page}`, 'utf8');
        const root = parse(html);
        const potentialUrls = root.querySelectorAll('a');

        let folderName = root.querySelector('title').textContent;
        // Normalize the folder name
        folderName = folderName.replaceAll('/', '_')
            .replace('Library-', '')
            .replace('LIBRARY-', '')
            .trim()
            .replaceAll(/\s{2,}/g, ' ');
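        // e.g. a (hypothetical) title of "Library-Aeroplanes  Flight" would end up as "Aeroplanes Flight".
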
        // A few special cases, let's just handle these explicitly
        if (folderName === 'AEROPLANES') {
            folderName = 'Aeroplanes';
        }

        if (folderName === 'ACCOUNTING') {
            folderName = 'Accounting';
        }

        if (!folders.includes(folderName)) {
            folders.push(folderName);
            folderLinks[folderName] = [];
        }

        let validUrls = 0;

        for (const anchor of potentialUrls)
        {
            const url = anchor.getAttribute('href') || '';

            // URL found, but it's not a PDF
            if (!url.toLowerCase().endsWith('.pdf')) {
                // console.error(`${url} is not a pdf`);
                continue;
            }

            // All URLs in the `href` attributes are relative, so we fix 'em up.
            const fullUrl = `http://www.survivorlibrary.com${url}`;
            const filename = url.split('/').pop();
            // Record the filename under this category even if the URL turns out to be a
            // duplicate, so the hardlink script below still links it into this folder.
            folderLinks[folderName].push(filename);

            // Already collected this URL, skip the duplicate
            if (pdfUrls.includes(fullUrl)) {
                continue;
            }

            // console.log(`Found PDF: ${fullUrl}`);
            pdfUrls.push(fullUrl);
            validUrls++;
        }

        console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
    }

    /**
     * Since the `pdfUrls.sh` script just downloads all the PDFs into one directory,
     * we create a script that hardlinks each file to the correct categories.
     *
     * There are likely some PDFs that are duplicated within 2+ categories, but that's fine imo.
     */
    let folderLinkCmds = ['mkdir -p Sorted'];
    for (const folder of folders)
    {
        const links = folderLinks[folder];
        folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`);

        for (const filename of links)
        {
            folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`);
        }
    }
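
    // The generated folderLink.sh ends up looking roughly like this (filenames hypothetical):
    //   mkdir -p Sorted
    //   mkdir -p 'Sorted/Aeroplanes'
    //   ln 'some_book.pdf' 'Sorted/Aeroplanes/some_book.pdf'
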
    await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
    await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));
    await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4));
    await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n'));

    /**
     * It seems the web server for Survivor Library doesn't support
     * the `Range` HTTP header, so we can't just "continue" a partial download.
     *
     * I wouldn't be surprised if one (or more) of the PDFs ends up corrupted, as we just check
     * whether the file _exists_ before skipping it (if it does exist).
     *
     * An alternative would be to attempt to parse each PDF locally
     * to verify that it's at least a valid PDF.
     */
    const scriptOutput = pdfUrls.map(url => {
        const filename = url.split('/').pop();
        return `[ ! -f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`;
    });
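
    // Each generated line looks roughly like this (filename and path are hypothetical):
    //   [ ! -f 'some_book.pdf' ] && (echo "Downloading: some_book.pdf" && curl ... -fsSLO -C - -g 'http://www.survivorlibrary.com/<path>/some_book.pdf');
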
    await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));
}

parseHtml();
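
/**
 * A rough sketch of the "parse each PDF locally" idea mentioned above; it's not part of the
 * original pipeline and isn't wired up to anything. Instead of a full parse it only checks
 * for the `%PDF-` magic bytes at the start of each downloaded file, which catches truncated
 * downloads and HTML error pages saved as .pdf, but not every possible form of corruption.
 * The directory defaults to '.', assuming it's run from wherever pdfUrls.sh downloaded to.
 */
async function findSuspectPdfs(dir = '.')
{
    const suspects = [];

    for (const entry of await fs.readdir(dir)) {
        if (!entry.toLowerCase().endsWith('.pdf')) {
            continue;
        }

        // Only read the first few bytes instead of loading the whole (possibly huge) file.
        const handle = await fs.open(`${dir}/${entry}`, 'r');
        const { buffer } = await handle.read(Buffer.alloc(5), 0, 5, 0);
        await handle.close();

        if (buffer.toString('latin1') !== '%PDF-') {
            suspects.push(entry);
        }
    }

    return suspects;
}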