Add README, minor tweaks and comments to parse_html_pages.js
This commit is contained in:
parent
b34dfce7c5
commit
2645ba7e5c
17
README.md
Normal file
17
README.md
Normal file
@ -0,0 +1,17 @@
# Survival Library

Various scripts for scraping and parsing survivallibrary.com

Keep in mind it was meant to be a quick-and-dirty project, so things were kind of hotglued together as I went along.

## Requirements

1. Node.js + npm for `parse_html_pages.js`
    - I was using `v16.13.2` (LTS) at the time of writing.
    - Remember to run `npm install` before attempting to run `node parse_html_pages.js`
2. `pdfinfo` via `poppler-utils`
    - Used by one of the Bash scripts to validate the downloaded PDF files
3. Bash for the various scripts
    - Bash scripts were used on a Debian 10 (Buster) machine, which has it by default. Theoretically they should work on Windows (e.g. via Git Bash), but due to requirement #2 it might not work as expected.
4. `curl` - which downloads all the pages.
5.
@ -14,6 +14,11 @@ async function parseHtml()
|
||||
const folders = [];
|
||||
const folderLinks = {};
|
||||
|
||||
const specialFolderCases = {
|
||||
AEROPLANES: 'Aeroplanes',
|
||||
ACCOUNTING: 'Accounting',
|
||||
};
|
||||
|
||||
for (const page of pages)
|
||||
{
|
||||
const html = await fs.readFile(`./pages/${page}`, 'utf8');
|
||||
@ -29,12 +34,8 @@ async function parseHtml()
|
||||
.replaceAll(/\s{2,}/g, ' ');
|
||||
|
||||
// A few special cases, let's just handle these explicitly
|
||||
if (folderName === 'AEROPLANES') {
|
||||
folderName = 'Aeroplanes';
|
||||
}
|
||||
|
||||
if (folderName === 'ACCOUNTING') {
|
||||
folderName = 'Accounting';
|
||||
if (folderName in specialFolderCases) {
|
||||
folderName = specialFolderCases[folderName];
|
||||
}
|
||||
|
||||
if (!folders.includes(folderName)) {
|
||||
@ -77,6 +78,17 @@ async function parseHtml()
|
||||
* We create a script that hardlinks each file to the correct categories.
|
||||
*
|
||||
* There are likely some PDFs that are duplicated within 2+ categories, but that's fine imo.
|
||||
*
|
||||
* Since the files are *hardlinks*, we don't need to worry about using extra space.
|
||||
* You can safely delete the original, "unsorted" files and they will still be saved.
|
||||
* However, if you _modify_ (e.g. edit the PDF) the file, it will affect the one that has been sorted.
|
||||
*
|
||||
* Anyways, this method is kind of janky. We're basically generating a bunch of shell commands:
|
||||
* - Create a `Sorted` folder
|
||||
* - Create a `Sorted/<category>` folder
|
||||
* - Hardlink the unsorted `<filename.pdf>` to `Sorted/<category>/<filename.pdf>`
|
||||
*
|
||||
* If it works, it ain't stupid ¯\_(ツ)_/¯
|
||||
*/
|
||||
let folderLinkCmds = ['mkdir -p Sorted'];
|
||||
for (const folder of folders)
|
||||
@ -102,8 +114,9 @@ async function parseHtml()
|
||||
* I wouldn't be surprised if one (or more) of the PDFs end up corrupted, as we just check if the file _exists_
|
||||
* before skipping it (if it does exist).
|
||||
*
|
||||
* An alternative would be to attempt to parse each PDF locally
|
||||
* to verify that it's at least a valid PDF.
|
||||
* As a workaround, I created `validate_pdfs.sh` to at least validate that the PDFs are valid.
|
||||
* Keep in mind that most of the PDFs that are invalid, are also corrupted on Survival Library's website.
|
||||
* Meaning it's the _source_ that's corrupt, not the downloaded file specifically.
|
||||
*/
|
||||
const scriptOutput = pdfUrls.map(url => {
|
||||
const filename = url.split('/').pop();
|
||||
|
Loading…
Reference in New Issue
Block a user