Add more scripts and tools

This commit is contained in:
Alex Thomassen 2022-02-07 10:30:15 +01:00
parent d272abc4d1
commit b34dfce7c5
Signed by: Alex
GPG Key ID: 10BD786B5F6FF5DE
8 changed files with 151 additions and 163 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
node_modules
pdfUrls.*
folderLink.sh
folders.*

27
broken_pdf_remove.sh Normal file
View File

@ -0,0 +1,27 @@
#!/bin/bash
# Removes every copy of the PDFs named in `broken_pdfs.txt` (one filename per
# line) found anywhere below the current directory.
#
# Usage:
#   ./broken_pdf_remove.sh          # dry run: only prints what would be deleted
#   ./broken_pdf_remove.sh delete   # actually deletes the files

#######################################
# Deletes (or, in dry-run mode, lists) every path below `.` whose basename
# matches an entry in broken_pdfs.txt.
# Arguments: $1 - "delete" to really remove files; anything else is a dry run
# Outputs:   one "Would delete <path>" / "Deleting <path>" line per match
# Returns:   0 on success, 1 if the list is unreadable or an `rm` failed
#######################################
remove_broken_pdfs() {
  local mode="${1:-}"
  local list_file="broken_pdfs.txt"
  local dry_run=1
  local status=0
  local name path

  if [[ "$mode" == "delete" ]]; then
    dry_run=0
  else
    echo "Dry run. To delete broken PDFs, run with 'delete' as the first argument."
  fi

  if [[ ! -r "$list_file" ]]; then
    echo "Cannot read ${list_file}; nothing to do." >&2
    return 1
  fi

  # Read the list line by line so that names are never word-split or
  # glob-expanded by the shell; `|| [[ -n ... ]]` keeps a last line that has
  # no trailing newline.
  while IFS= read -r name || [[ -n "$name" ]]; do
    name="${name%$'\r'}" # tolerate CRLF line endings
    [[ -n "$name" ]] || continue

    # A single name can match several paths (e.g. the hardlinked copies in
    # the category folders), so consume find's output one NUL-terminated path
    # at a time instead of treating the whole output as one filename.
    while IFS= read -r -d '' path; do
      if ((dry_run)); then
        echo "Would delete $path"
      else
        echo "Deleting $path"
        rm -- "$path" || status=1
      fi
    done < <(find . -name "$name" -print0)
  done < "$list_file"

  return "$status"
}

remove_broken_pdfs "$@"

17
broken_pdfs.txt Normal file
View File

@ -0,0 +1,17 @@
advanced_civics-the_spirit_the_form_and_the_functions_of_the_american_govt_1905.pdf
a_text_book_of_civics_for_the_state_of_washington_1910.pdf
farm_friends_and_farm_foes-a_text-book_of_agricultural_science_1910.pdf
hygiene_of_the_boot_and_shoe_industry_in_massachusetts_1912.pdf
nuclear_war_survival_skills_september_1979.pdf
savage_cub_bolt_action_rifle.pdf
scientific-american-1853-05-07-v08-n34.pdf
scientific-american-1853-09-10-v08-n52.pdf
scientific-american-1856-06-07-v11-n39.pdf
scientific-american-1857-10-31-v13-n08.pdf
scientific-american-1857-12-26-v13-n16.pdf
scientific-american-1858-02-13-v13-n23.pdf
scientific-american-1858-09-04-v13-n52.pdf
scientific-american-1874-09-12-v31-n11.pdf
scientific-american-1876-01-29-v34-n06.pdf
the_telescope-nott_1832.pdf
youths_book_of_astronomy_1838.pdf

13
get_page_urls_browser.js Normal file
View File

@ -0,0 +1,13 @@
// Helper for copying every page/category URL to the clipboard.
// Usage: open http://www.survivorlibrary.com/library-download.html, open the
// developer tools, and paste the snippet below into the console.

// Every anchor inside the first <tbody> on the page is a category link.
const anchors = document.querySelector('tbody').querySelectorAll('a');
// One absolute URL per line.
const linkList = Array.from(anchors, anchor => anchor.href).join('\n');

// `copy()` places the given text on the clipboard. As far as I know it only
// exists in the browser console, and it might even be a Firefox-only thing.
copy(linkList);

View File

@ -1,5 +1,12 @@
#!/bin/bash
# Requests the HTML page behind each URL and dumps every response into its own
# file inside the `pages` directory.
# Requests are spaced 1.5 seconds apart, so grabbing all the pages takes
# roughly 6 minutes.
#
# Everything is dumped to disk so that the pages can be processed "offline" in
# peace, without hitting the website again each time the categories need to be
# re-processed (e.g. when the other scripts are tweaked).
OUTPUT_DIR='pages'
mkdir -p "$OUTPUT_DIR"

View File

@ -1,161 +0,0 @@
http://www.survivorlibrary.com/index.php/8-category/5-new-additions
http://www.survivorlibrary.com/index.php/8-category/3-library-accounting
http://www.survivorlibrary.com/index.php/8-category/43-library-engineering-electrical
http://www.survivorlibrary.com/index.php/8-category/79-library-livestock-cattle
http://www.survivorlibrary.com/index.php/8-category/115-library-railroads
http://www.survivorlibrary.com/index.php/8-category/4-library-aeroplanes
http://www.survivorlibrary.com/index.php/8-category/44-library-engineering-general
http://www.survivorlibrary.com/index.php/8-category/80-library-livestock-rabbits%20and%20cavies
http://www.survivorlibrary.com/index.php/8-category/171-library-rat-control
http://www.survivorlibrary.com/index.php/8-category/6-library-airships
http://www.survivorlibrary.com/index.php/8-category/45-library-engineering-hydraulics
http://www.survivorlibrary.com/index.php/8-category/81-library-livestock-sheep
http://www.survivorlibrary.com/index.php/8-category/116-library-refrigeration
http://www.survivorlibrary.com/index.php/8-category/7-library-archery
http://www.survivorlibrary.com/index.php/8-category/46-library-engraving%20and%20woodcuts
http://www.survivorlibrary.com/index.php/8-category/82-library-livestock-swine
http://www.survivorlibrary.com/index.php/8-category/117-library-sanitation
http://www.survivorlibrary.com/index.php/8-category/8-library-architecture
http://www.survivorlibrary.com/index.php/8-category/47-library-ethics
http://www.survivorlibrary.com/index.php/8-category/83-library-machine%20tools
http://www.survivorlibrary.com/index.php/8-category/120-library-sewage
http://www.survivorlibrary.com/index.php/8-category/9-library-astronomy
http://www.survivorlibrary.com/index.php/8-category/48-library-farming
http://www.survivorlibrary.com/index.php/8-category/169-library-machinerys-reference
http://www.survivorlibrary.com/index.php/8-category/121-library-sewing
http://www.survivorlibrary.com/index.php/8-category/10-library-baking
http://www.survivorlibrary.com/index.php/8-category/179-library-farming2
http://www.survivorlibrary.com/index.php/8-category/172-library-mathematics
http://www.survivorlibrary.com/index.php/8-category/122-library-shelter
http://www.survivorlibrary.com/index.php/8-category/11-library-banking
http://www.survivorlibrary.com/index.php/8-category/49-library-farming-corn
http://www.survivorlibrary.com/index.php/8-category/84-library-mechanical%20drawing
http://www.survivorlibrary.com/index.php/8-category/123-library-shipbuilding
http://www.survivorlibrary.com/index.php/8-category/12-library-basketry
http://www.survivorlibrary.com/index.php/8-category/50-library-farming-fish
http://www.survivorlibrary.com/index.php/8-category/85-library-medical%20courses-us%20army
http://www.survivorlibrary.com/index.php/8-category/124-library-shoemaking
http://www.survivorlibrary.com/index.php/8-category/13-library-bee%20journal%20(american)
http://www.survivorlibrary.com/index.php/8-category/167-library-farming-grapes_wine_raisins
http://www.survivorlibrary.com/index.php/8-category/86-library-medical-anesthesia
http://www.survivorlibrary.com/index.php/8-category/125-library-shorthand
http://www.survivorlibrary.com/index.php/8-category/14-library-bee%20journal%20(british)
http://www.survivorlibrary.com/index.php/8-category/51-library-farming-potato%20and%20sweet%20potato
http://www.survivorlibrary.com/index.php/8-category/87-library-medical-diagnostics
http://www.survivorlibrary.com/index.php/8-category/126-library-silk%20culture
http://www.survivorlibrary.com/index.php/8-category/15-library-beekeeping
http://www.survivorlibrary.com/index.php/8-category/52-library-firearms-books
http://www.survivorlibrary.com/index.php/8-category/88-library-medical-emergency
http://www.survivorlibrary.com/index.php/8-category/127-library-sliderules%20and%20abacus
http://www.survivorlibrary.com/index.php/8-category/16-library-beekeeping%202
http://www.survivorlibrary.com/index.php/8-category/53-library-firearms-manuals
http://www.survivorlibrary.com/index.php/8-category/168-library-medical-hypnotism
http://www.survivorlibrary.com/index.php/8-category/128-library-smithing
http://www.survivorlibrary.com/index.php/8-category/17-library-berries
http://www.survivorlibrary.com/index.php/8-category/54-library-fishing
http://www.survivorlibrary.com/index.php/8-category/89-library-medical-medicine%201900-1922
http://www.survivorlibrary.com/index.php/8-category/129-library-steam%20engines
http://www.survivorlibrary.com/index.php/8-category/18-library-boilermaker
http://www.survivorlibrary.com/index.php/8-category/55-library-food
http://www.survivorlibrary.com/index.php/8-category/90-library-medical-microscopy
http://www.survivorlibrary.com/index.php/8-category/130-library-stone%20and%20masonry
http://www.survivorlibrary.com/index.php/8-category/19-library-bookbinding
http://www.survivorlibrary.com/index.php/8-category/56-library-forestry
http://www.survivorlibrary.com/index.php/8-category/91-library-medical-nursing%201900-1921
http://www.survivorlibrary.com/index.php/8-category/131-library-surveying
http://www.survivorlibrary.com/index.php/8-category/181-library-books%20for%20boys%20and%20girls
http://www.survivorlibrary.com/index.php/8-category/57-library-forging%20and%20casting
http://www.survivorlibrary.com/index.php/8-category/92-library-medical-obstetrics%201900-1922
http://www.survivorlibrary.com/index.php/8-category/132-library-survival-individual
http://www.survivorlibrary.com/index.php/8-category/20-library-botany
http://www.survivorlibrary.com/index.php/8-category/58-library-formulas
http://www.survivorlibrary.com/index.php/8-category/93-library-medical-surgery%201900-1922
http://www.survivorlibrary.com/index.php/8-category/133-library-teaching
http://www.survivorlibrary.com/index.php/8-category/21-library-boy%20scout%20manuals
http://www.survivorlibrary.com/index.php/8-category/175-library-fuels
http://www.survivorlibrary.com/index.php/8-category/94-library-medical-surgery%202
http://www.survivorlibrary.com/index.php/8-category/134-library-teaching-arithmetic
http://www.survivorlibrary.com/index.php/8-category/22-library-brewing%20and%20distilling
http://www.survivorlibrary.com/index.php/8-category/59-library-geodesy
http://www.survivorlibrary.com/index.php/8-category/95-library-medical-x-rays
http://www.survivorlibrary.com/index.php/8-category/135-library-teaching-civics
http://www.survivorlibrary.com/index.php/8-category/23-library-bridges%20and%20dams
http://www.survivorlibrary.com/index.php/8-category/60-library-geography
http://www.survivorlibrary.com/index.php/8-category/96-library-meteorology
http://www.survivorlibrary.com/index.php/8-category/136-library-teaching-phonics
http://www.survivorlibrary.com/index.php/8-category/24-library-butchering
http://www.survivorlibrary.com/index.php/8-category/61-library-glassmaking
http://www.survivorlibrary.com/index.php/8-category/180-library-mimeograph
http://www.survivorlibrary.com/index.php/8-category/137-library-teaching-readers
http://www.survivorlibrary.com/index.php/8-category/25-library-canning
http://www.survivorlibrary.com/index.php/8-category/62-library-gunpowder%20and%20explosives
http://www.survivorlibrary.com/index.php/8-category/97-library-misc
http://www.survivorlibrary.com/index.php/8-category/138-library-teaching-readers-mcguffey
http://www.survivorlibrary.com/index.php/8-category/26-library-cheese%20and%20butter%20making
http://www.survivorlibrary.com/index.php/8-category/63-library-hatmaking
http://www.survivorlibrary.com/index.php/8-category/98-library-monasticism
http://www.survivorlibrary.com/index.php/8-category/139-library-telegraph%20and%20telephone
http://www.survivorlibrary.com/index.php/8-category/27-library-chemistry
http://www.survivorlibrary.com/index.php/8-category/64-library-heating
http://www.survivorlibrary.com/index.php/8-category/99-library-morality
http://www.survivorlibrary.com/index.php/8-category/140-library-thanksgiving
http://www.survivorlibrary.com/index.php/8-category/28-library-christmas
http://www.survivorlibrary.com/index.php/8-category/65-library-heavy%20and%20industrial%20machinery
http://www.survivorlibrary.com/index.php/8-category/100-library-mushrooms
http://www.survivorlibrary.com/index.php/8-category/141-library-tobacco
http://www.survivorlibrary.com/index.php/8-category/29-library-clockmaking
http://www.survivorlibrary.com/index.php/8-category/66-library-hemp%20and%20flax
http://www.survivorlibrary.com/index.php/8-category/101-library-musical%20instruments
http://www.survivorlibrary.com/index.php/8-category/142-library-toys
http://www.survivorlibrary.com/index.php/8-category/30-library-coal%20and%20mining
http://www.survivorlibrary.com/index.php/8-category/67-library-herbalism
http://www.survivorlibrary.com/index.php/8-category/102-library-navigation
http://www.survivorlibrary.com/index.php/8-category/143-library-trapping%20and%20hunting
http://www.survivorlibrary.com/index.php/8-category/31-library-coffee-tea
http://www.survivorlibrary.com/index.php/8-category/68-library-history-american
http://www.survivorlibrary.com/index.php/8-category/103-library-nbc
http://www.survivorlibrary.com/index.php/8-category/144-library-turpentine,%20glue,%20solvents
http://www.survivorlibrary.com/index.php/8-category/32-library-conduct%20of%20life
http://www.survivorlibrary.com/index.php/8-category/69-library-home%20economics
http://www.survivorlibrary.com/index.php/8-category/104-library-opium
http://www.survivorlibrary.com/index.php/8-category/145-library-veterinary
http://www.survivorlibrary.com/index.php/8-category/33-library-construction
http://www.survivorlibrary.com/index.php/8-category/70-library-horses
http://www.survivorlibrary.com/index.php/8-category/105-library-optometry
http://www.survivorlibrary.com/index.php/8-category/146-library-wagon%20and%20coach%20building
http://www.survivorlibrary.com/index.php/8-category/34-library-cooking%20and%20cookbooks
http://www.survivorlibrary.com/index.php/8-category/71-library-journalism
http://www.survivorlibrary.com/index.php/8-category/106-library-painting%20and%20drawing
http://www.survivorlibrary.com/index.php/8-category/147-library-weaving
http://www.survivorlibrary.com/index.php/8-category/35-library-cotton
http://www.survivorlibrary.com/index.php/8-category/72-library-knitting-lace-needlepoint
http://www.survivorlibrary.com/index.php/8-category/107-library-papermaking
http://www.survivorlibrary.com/index.php/8-category/148-library-welding
http://www.survivorlibrary.com/index.php/8-category/36-library-cycles%20(bi-tri-motor)
http://www.survivorlibrary.com/index.php/8-category/73-library-laundry
http://www.survivorlibrary.com/index.php/8-category/108-library-photography
http://www.survivorlibrary.com/index.php/8-category/149-library-wind%20and%20water
http://www.survivorlibrary.com/index.php/8-category/37-library-dentistry
http://www.survivorlibrary.com/index.php/8-category/173-library-law
http://www.survivorlibrary.com/index.php/8-category/109-library-pottery
http://www.survivorlibrary.com/index.php/8-category/151-library-wood-carpentry
http://www.survivorlibrary.com/index.php/8-category/38-library-drilling
http://www.survivorlibrary.com/index.php/8-category/74-library-leather
http://www.survivorlibrary.com/index.php/8-category/110-library-poultry
http://www.survivorlibrary.com/index.php/8-category/152-library-wood-carving
http://www.survivorlibrary.com/index.php/8-category/39-library-economics
http://www.survivorlibrary.com/index.php/8-category/75-library-leisure-games%20and%20sports
http://www.survivorlibrary.com/index.php/8-category/111-library-primers
http://www.survivorlibrary.com/index.php/8-category/153-library-wood-furniture
http://www.survivorlibrary.com/index.php/8-category/40-library-embalming
http://www.survivorlibrary.com/index.php/8-category/76-library-leisure-recreation%20magazine
http://www.survivorlibrary.com/index.php/8-category/112-library-printing
http://www.survivorlibrary.com/index.php/8-category/154-library-work%20magazine
http://www.survivorlibrary.com/index.php/8-category/41-library-encyclopedias
http://www.survivorlibrary.com/index.php/8-category/77-library-leisure-whist
http://www.survivorlibrary.com/index.php/8-category/113-library-radio
http://www.survivorlibrary.com/index.php/8-category/118-library-scientific%20american%20(series%201)
http://www.survivorlibrary.com/index.php/8-category/42-library-engineering-drainage
http://www.survivorlibrary.com/index.php/8-category/78-library-lithography
http://www.survivorlibrary.com/index.php/8-category/114-library-radio%2073%20magazine
http://www.survivorlibrary.com/index.php/8-category/119-library-scientific%20american%20(series%202)

View File

@ -1,10 +1,18 @@
const { parse } = require('node-html-parser');
const fs = require('fs/promises');
/**
* This is a really hacky and ugly method, but essentially we're relying on Node.js to
* parse the HTML and grab all the PDF links.
*
* To be honest, this could've been done with some fancy regex or whatever, but who cares.
*/
async function parseHtml()
{
const pages = await fs.readdir('./pages');
const pdfUrls = [];
const folders = [];
const folderLinks = {};
for (const page of pages)
{
@ -12,6 +20,28 @@ async function parseHtml()
const root = parse(html);
const potentialUrls = root.querySelectorAll('a');
let folderName = root.querySelector('title').textContent;
// Normalize the folder name
folderName = folderName.replaceAll('/', '_')
.replace('Library-', '')
.replace('LIBRARY-', '')
.trim()
.replaceAll(/\s{2,}/g, ' ');
// A few special cases, let's just handle these explicitly
if (folderName === 'AEROPLANES') {
folderName = 'Aeroplanes';
}
if (folderName === 'ACCOUNTING') {
folderName = 'Accounting';
}
if (!folders.includes(folderName)) {
folders.push(folderName);
folderLinks[folderName] = [];
}
let validUrls = 0;
for (const anchor of potentialUrls)
@ -24,7 +54,10 @@ async function parseHtml()
continue;
}
// All URLs in the `href` attributes are relative, so we fix em up.
const fullUrl = `http://www.survivorlibrary.com${url}`;
const filename = url.split('/').pop();
folderLinks[folderName].push(filename);
// Duplicate
if (pdfUrls.includes(fullUrl)) {
@ -39,12 +72,42 @@ async function parseHtml()
console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`);
}
/**
* Since the `pdfUrls.sh` script just downloads all the PDFs into one directory,
* we create a script that hardlinks each file into the correct category folders.
*
* There are likely some PDFs that are duplicated within 2+ categories, but that's fine imo.
*/
let folderLinkCmds = ['mkdir -p Sorted'];
for (const folder of folders)
{
const links = folderLinks[folder];
folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`);
for (const filename of links)
{
folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`);
}
}
await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls));
await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n'));
await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4));
await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n'));
/**
* It seems the web server for Survivor Library doesn't support
* the `Range` HTTP header. We can't just "continue" a download.
*
* I wouldn't be surprised if one (or more) of the PDFs end up corrupted, as we just check if the file _exists_
* before skipping it (if it does exist).
*
* An alternative would be to attempt to parse each PDF locally
* to verify that it's at least a valid PDF.
*/
const scriptOutput = pdfUrls.map(url => {
const filename = url.split('/').pop();
return `[ ! -f "${filename}" ] && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - "${url}"`;
return `[ ! -f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`;
});
await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n'));

20
validate_pdfs.sh Normal file
View File

@ -0,0 +1,20 @@
#!/bin/bash
# Checks every PDF in the current directory with `pdfinfo` and appends the
# names of the ones it cannot parse to `validate.log`.
#
# Requires the `pdfinfo` command; on Debian/Ubuntu it is part of the
# `poppler-utils` package.
# Most of the code is adapted from this answer: https://superuser.com/a/580895
#
# I highly recommend manually verifying the PDFs that this script considers
# "broken". When I ran it, it found 19 "broken" PDFs:
#   - 1 of them wasn't broken at all,
#   - 1 was just a corrupted download,
#   - the other 17 were actually broken and didn't even load from
#     survivorlibrary.com.

#######################################
# Runs `pdfinfo` on each *.pdf in the current directory and logs failures.
# Outputs: appends "[timestamp] <file> is broken" lines to validate.log
# Returns: 0 after checking all files, 1 if `pdfinfo` is not available
#######################################
validate_pdfs() {
  local log_file="validate.log"
  local pdf now

  # Without this check a missing `pdfinfo` would make every single PDF look
  # broken, because "command not found" is also a non-zero exit status.
  if ! command -v pdfinfo > /dev/null 2>&1; then
    echo "pdfinfo not found; install poppler-utils first." >&2
    return 1
  fi

  for pdf in *.pdf; do
    # When nothing matches, the glob stays as the literal string "*.pdf";
    # skip it rather than logging a file that does not exist. Dangling
    # symlinks are still checked (and will be reported as broken).
    [[ -e "$pdf" || -L "$pdf" ]] || continue

    # The "./" prefix stops a name starting with "-" being read as an option.
    if ! pdfinfo "./$pdf" &> /dev/null; then
      now="$(date +"%Y-%m-%d %H:%M:%S")"
      echo "[$now] $pdf is broken" >> "$log_file"
    fi
  done

  return 0
}

validate_pdfs