Add README, minor tweaks and comments to parse_html_pages.js

Alex Thomassen 2022-02-07 10:56:01 +01:00
parent b34dfce7c5
commit 2645ba7e5c
Signed by: Alex
GPG Key ID: 10BD786B5F6FF5DE
2 changed files with 38 additions and 8 deletions

README.md Normal file (17 additions)

@@ -0,0 +1,17 @@
# Survival Library
Various scripts for scraping and parsing survivallibrary.com
Keep in mind this was meant to be a quick-and-dirty project, so things were kind of hot-glued together as I went along.
## Requirements
1. Node.js + npm for `parse_html_pages.js`
- I was using `v16.13.2` (LTS) at the time of writing.
- Remember to run `npm install` before attempting to run `node parse_html_pages.js`
2. `pdfinfo` via `poppler-utils`
- Used by one of the Bash scripts to validate the downloaded PDF files (a rough sketch of that check is shown after this list)
3. Bash for the various scripts
- The Bash scripts were used on a Debian 10 (Buster) machine, which has Bash by default. In theory they should also work on Windows (e.g. via Git Bash), but due to requirement #2 they might not work as expected.
4. `curl` - used to download all the pages.
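For reference, here's a rough idea of how a `pdfinfo`-based check can work. This is only a sketch and not necessarily what `validate_pdfs.sh` does exactly; it assumes the downloaded PDFs sit in the current directory:

```bash
#!/usr/bin/env bash
# Sketch: pdfinfo exits non-zero when it can't parse a file,
# so anything it rejects gets flagged as possibly corrupt.
for pdf in *.pdf; do
    if ! pdfinfo "$pdf" > /dev/null 2>&1; then
        echo "Possibly corrupt: $pdf"
    fi
done
```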

parse_html_pages.js

@@ -14,6 +14,11 @@ async function parseHtml()
const folders = [];
const folderLinks = {};
const specialFolderCases = {
AEROPLANES: 'Aeroplanes',
ACCOUNTING: 'Accounting',
};
for (const page of pages)
{
const html = await fs.readFile(`./pages/${page}`, 'utf8');
@@ -29,12 +34,8 @@ async function parseHtml()
.replaceAll(/\s{2,}/g, ' ');
// A few special cases, let's just handle these explicitly
if (folderName === 'AEROPLANES') {
folderName = 'Aeroplanes';
}
if (folderName === 'ACCOUNTING') {
folderName = 'Accounting';
}
if (folderName in specialFolderCases) {
folderName = specialFolderCases[folderName];
}
if (!folders.includes(folderName)) {
@@ -77,6 +78,17 @@ async function parseHtml()
* We create a script that hardlinks each file to the correct categories.
*
* There are likely some PDFs that are duplicated within 2+ categories, but that's fine imo.
*
* Since the files are *hardlinks*, we don't need to worry about using extra space.
* You can safely delete the original, "unsorted" files and the sorted copies will still be intact.
* However, if you _modify_ a file (e.g. edit the PDF), the change affects the sorted copy as well, since both names point to the same file on disk.
*
* Anyways, this method is kind of janky. We're basically generating a bunch of shell commands:
* - Create a `Sorted` folder
* - Create a `Sorted/<category>` folder
* - Hardlink the unsorted `<filename.pdf>` to `Sorted/<category>/<filename.pdf>`
*
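* To make that concrete (the category and filename below are made up, purely to illustrate),
* the generated script ends up containing lines roughly like:
*
*   mkdir -p Sorted
*   mkdir -p 'Sorted/Aeroplanes'
*   ln 'SOME_BOOK.pdf' 'Sorted/Aeroplanes/SOME_BOOK.pdf'
*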
* If it works, it ain't stupid ¯\_(ツ)_/¯
*/
let folderLinkCmds = ['mkdir -p Sorted'];
for (const folder of folders)
@@ -102,8 +114,9 @@ async function parseHtml()
* I wouldn't be surprised if one (or more) of the PDFs ends up corrupted, as we only check whether the file
* _exists_ before skipping the download.
*
* An alternative would be to attempt to parse each PDF locally
* to verify that it's at least a valid PDF.
* As a workaround, I created `validate_pdfs.sh` to at least validate that the PDFs are valid.
* Keep in mind that most of the PDFs that are invalid are also corrupted on Survival Library's website,
* meaning it's the _source_ that's corrupt, not the downloaded file specifically.
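*
* (Purely illustrative, with a made-up filename: the "does the file exist" check is essentially
* `[ -f 'SOME_BOOK.pdf' ] || <download it>`, and `validate_pdfs.sh` then runs `pdfinfo` over the
* downloaded files to flag the ones that don't parse.)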
*/
const scriptOutput = pdfUrls.map(url => {
const filename = url.split('/').pop();