Add README, minor tweaks and comments to parse_html_pages.js
parent b34dfce7c5
commit 2645ba7e5c

17 README.md Normal file
@@ -0,0 +1,17 @@
+# Survival Library
+
+Various scripts for scraping and parsing survivallibrary.com
+
+Keep in mind it was meant to be a quick-and-dirty project, so things were kind of hotglued together as I went along.
+
+## Requirements
+
+1. Node.js + npm for `parse_html_pages.js`
+    - I was using `v16.13.2` (LTS) at the time of writing.
+    - Remember to run `npm install` before attempting to run `node parse_html_pages.js`
+2. `pdfinfo` via `poppler-utils`
+    - Used by one of the Bash scripts to validate the downloaded PDF files
+3. Bash for the various scripts
+    - Bash scripts were used on a Debian 10 (Buster) machine, which has it by default. Theoretically they should work on Windows (e.g. via Git Bash), but due to requirement #2 it might not work as expected.
+4. `curl` - which downloads all the pages.
+5.
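For requirement #2 above, a minimal sketch of the kind of check `pdfinfo` enables (assuming, as the poppler tools generally do, a non-zero exit status when the file cannot be parsed; the filename is just a placeholder):

```bash
#!/usr/bin/env bash
# Sketch only: pdfinfo prints a PDF's metadata and fails (non-zero exit)
# when the file is truncated or not a PDF at all.
if pdfinfo "some_download.pdf" > /dev/null 2>&1; then
    echo "some_download.pdf parses as a PDF"
else
    echo "some_download.pdf is missing or corrupt"
fi
```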
@@ -14,6 +14,11 @@ async function parseHtml()
     const folders = [];
     const folderLinks = {};
 
+    const specialFolderCases = {
+        AEROPLANES: 'Aeroplanes',
+        ACCOUNTING: 'Accounting',
+    };
+
     for (const page of pages)
     {
         const html = await fs.readFile(`./pages/${page}`, 'utf8');
@@ -29,12 +34,8 @@ async function parseHtml()
             .replaceAll(/\s{2,}/g, ' ');
 
         // A few special cases, let's just handle these explicitly
-        if (folderName === 'AEROPLANES') {
-            folderName = 'Aeroplanes';
-        }
-
-        if (folderName === 'ACCOUNTING') {
-            folderName = 'Accounting';
+        if (folderName in specialFolderCases) {
+            folderName = specialFolderCases[folderName];
         }
 
         if (!folders.includes(folderName)) {
@@ -77,6 +78,17 @@ async function parseHtml()
     * We create a script that hardlinks each file to the correct categories.
     *
     * There are likely some PDFs that are duplicated within 2+ categories, but that's fine imo.
+    *
+    * Since the files are *hardlinks*, we don't need to worry about using extra space.
+    * You can safely delete the original, "unsorted" files and they will still be saved.
+    * However, if you _modify_ (e.g. edit the PDF) the file, it will affect the one that has been sorted.
+    *
+    * Anyways, this method is kind of janky. We're basically generating a bunch of shell commands:
+    * - Create a `Sorted` folder
+    * - Create a `Sorted/<category>` folder
+    * - Hardlink the unsorted `<filename.pdf>` to `Sorted/<category>/<filename.pdf>`
+    *
+    * If it works, it ain't stupid ¯\_(ツ)_/¯
     */
    let folderLinkCmds = ['mkdir -p Sorted'];
    for (const folder of folders)
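The comment block above describes generating shell commands; the strings pushed into `folderLinkCmds` would amount to something like the following (illustrative names, not actual output):

```bash
#!/usr/bin/env bash
# Illustrative shape of the generated commands: one Sorted/ tree whose
# entries are hardlinks to the unsorted downloads, so no extra space is used.
mkdir -p Sorted
mkdir -p "Sorted/Aeroplanes"
# ln without -s creates a hardlink: both paths refer to the same inode,
# so deleting the unsorted copy does not remove the sorted one.
ln "some_book.pdf" "Sorted/Aeroplanes/some_book.pdf"
```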
@@ -102,8 +114,9 @@ async function parseHtml()
     * I wouldn't be surprised if one (or more) of the PDFs end up corrupted, as we just check if the file _exists_
     * before skipping it (if it does exist).
     *
-    * An alternative would be to attempt to parse each PDF locally
-    * to verify that it's at least a valid PDF.
+    * As a workaround, I created `validate_pdfs.sh` to at least validate that the PDFs are valid.
+    * Keep in mind that most of the PDFs that are invalid, are also corrupted on Survival Library's website.
+    * Meaning it's the _source_ that's corrupt, not the downloaded file specifically.
     */
    const scriptOutput = pdfUrls.map(url => {
        const filename = url.split('/').pop();
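A loop in the spirit of `validate_pdfs.sh` (a sketch under the same `pdfinfo` assumption, not the script shipped in this repo) could flag the bad downloads like so:

```bash
#!/usr/bin/env bash
# Sketch: list every downloaded PDF that pdfinfo cannot parse.
# PDFs that the source site itself serves corrupted will show up here too.
for pdf in ./*.pdf; do
    if ! pdfinfo "$pdf" > /dev/null 2>&1; then
        echo "INVALID: $pdf"
    fi
done
```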