From b34dfce7c5435a0734103ea623f646aa141a0092 Mon Sep 17 00:00:00 2001
From: Alex Thomassen
Date: Mon, 7 Feb 2022 10:30:15 +0100
Subject: [PATCH] Add more scripts and tools

---
 .gitignore               |   4 +-
 broken_pdf_remove.sh     |  39 ++++++++++
 broken_pdfs.txt          |  17 +++++
 get_page_urls_browser.js |  13 ++++
 get_pages_with_pdfs.sh   |   7 ++
 page_list.txt            | 161 ---------------------------------------
 parse_html_pages.js      |  65 +++++++++++++++-
 validate_pdfs.sh         |  20 +++++
 8 files changed, 163 insertions(+), 163 deletions(-)
 create mode 100644 broken_pdf_remove.sh
 create mode 100644 broken_pdfs.txt
 create mode 100644 get_page_urls_browser.js
 delete mode 100644 page_list.txt
 create mode 100644 validate_pdfs.sh

diff --git a/.gitignore b/.gitignore
index ed5d0d5..169ac35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 node_modules
-pdfUrls.*
\ No newline at end of file
+pdfUrls.*
+folderLink.sh
+folders.*
\ No newline at end of file
diff --git a/broken_pdf_remove.sh b/broken_pdf_remove.sh
new file mode 100644
index 0000000..924cfa7
--- /dev/null
+++ b/broken_pdf_remove.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Removes every local copy of the PDFs listed in `broken_pdfs.txt`.
+#
+# Usage:
+#   ./broken_pdf_remove.sh          # dry run: only prints what would be deleted
+#   ./broken_pdf_remove.sh delete   # actually deletes the files
+#
+# The search starts in the current directory and descends into subdirectories.
+
+DRY_RUN=1;
+if [[ "$1" == "delete" ]]; then
+    DRY_RUN=0;
+else
+    echo "Dry run. To delete broken PDFs, run with 'delete' as the first argument.";
+fi
+
+for f in $(cat broken_pdfs.txt);
+do
+    files="$(find . -name "$f" -print)";
+
+    if [[ "$files" == "" ]]; then
+        continue;
+    fi
+
+    # `find` prints one path per line and may report several matches for the
+    # same name. Read them one at a time; iterating over the quoted "$files"
+    # would run the loop body once with all paths glued into a single string.
+    # File descriptor 3 is used so that `rm` keeps the script's own stdin.
+    while IFS= read -r -u 3 file;
+    do
+        if [[ $DRY_RUN != 0 ]]; then
+            echo "Would delete $file";
+        else
+            echo "Deleting $file";
+            rm "$file";
+        fi
+    done 3<<< "$files"
+done
\ No newline at end of file
diff --git a/broken_pdfs.txt b/broken_pdfs.txt
new file mode 100644
index 0000000..0abbf45
--- /dev/null
+++ b/broken_pdfs.txt
@@ -0,0 +1,17 @@
+advanced_civics-the_spirit_the_form_and_the_functions_of_the_american_govt_1905.pdf
+a_text_book_of_civics_for_the_state_of_washington_1910.pdf
+farm_friends_and_farm_foes-a_text-book_of_agricultural_science_1910.pdf
+hygiene_of_the_boot_and_shoe_industry_in_massachusetts_1912.pdf
+nuclear_war_survival_skills_september_1979.pdf
+savage_cub_bolt_action_rifle.pdf
+scientific-american-1853-05-07-v08-n34.pdf
+scientific-american-1853-09-10-v08-n52.pdf
+scientific-american-1856-06-07-v11-n39.pdf
+scientific-american-1857-10-31-v13-n08.pdf
+scientific-american-1857-12-26-v13-n16.pdf
+scientific-american-1858-02-13-v13-n23.pdf
+scientific-american-1858-09-04-v13-n52.pdf
+scientific-american-1874-09-12-v31-n11.pdf
+scientific-american-1876-01-29-v34-n06.pdf
+the_telescope-nott_1832.pdf
+youths_book_of_astronomy_1838.pdf
\ No newline at end of file
diff --git a/get_page_urls_browser.js b/get_page_urls_browser.js
new file mode 100644
index 0000000..22f4504
--- /dev/null
+++ b/get_page_urls_browser.js
@@ -0,0 +1,13 @@
+// I used this script to copy all the pages/categories to my clipboard
+// Open http://www.survivorlibrary.com/library-download.html, then open developer tools.
+// Copy/paste the script below
+
+const table = document.querySelector('tbody');
+const links = table.querySelectorAll('a');
+const linkList = Array.from(links)
+    .map(link => link.href)
+    .join('\n');
+
+// Only works in the browser console from what I know. Might even be a Firefox-only thing.
+// Copies specified text to clipboard +copy(linkList); \ No newline at end of file diff --git a/get_pages_with_pdfs.sh b/get_pages_with_pdfs.sh index 51cec47..1b7d225 100644 --- a/get_pages_with_pdfs.sh +++ b/get_pages_with_pdfs.sh @@ -1,5 +1,12 @@ #!/bin/bash +# The script will basically just request the HTML page of each URL +# Dump it into individual files in the `pages` directory +# The pages are requested with a 1.5 second pause in-between. +# So it'll take roughly 6 minutes to grab all the pages. +# +# The reason all the pages are dumped is so that we can process them "offline" in peace, +# without hitting the website every time we want to process all the categories (e.g. when the other scripts are tweaked). OUTPUT_DIR="pages"; mkdir -p "${OUTPUT_DIR}"; diff --git a/page_list.txt b/page_list.txt deleted file mode 100644 index 4bf8c6d..0000000 --- a/page_list.txt +++ /dev/null @@ -1,161 +0,0 @@ -http://www.survivorlibrary.com/index.php/8-category/5-new-additions -http://www.survivorlibrary.com/index.php/8-category/3-library-accounting -http://www.survivorlibrary.com/index.php/8-category/43-library-engineering-electrical -http://www.survivorlibrary.com/index.php/8-category/79-library-livestock-cattle -http://www.survivorlibrary.com/index.php/8-category/115-library-railroads -http://www.survivorlibrary.com/index.php/8-category/4-library-aeroplanes -http://www.survivorlibrary.com/index.php/8-category/44-library-engineering-general -http://www.survivorlibrary.com/index.php/8-category/80-library-livestock-rabbits%20and%20cavies -http://www.survivorlibrary.com/index.php/8-category/171-library-rat-control -http://www.survivorlibrary.com/index.php/8-category/6-library-airships -http://www.survivorlibrary.com/index.php/8-category/45-library-engineering-hydraulics -http://www.survivorlibrary.com/index.php/8-category/81-library-livestock-sheep -http://www.survivorlibrary.com/index.php/8-category/116-library-refrigeration 
-http://www.survivorlibrary.com/index.php/8-category/7-library-archery -http://www.survivorlibrary.com/index.php/8-category/46-library-engraving%20and%20woodcuts -http://www.survivorlibrary.com/index.php/8-category/82-library-livestock-swine -http://www.survivorlibrary.com/index.php/8-category/117-library-sanitation -http://www.survivorlibrary.com/index.php/8-category/8-library-architecture -http://www.survivorlibrary.com/index.php/8-category/47-library-ethics -http://www.survivorlibrary.com/index.php/8-category/83-library-machine%20tools -http://www.survivorlibrary.com/index.php/8-category/120-library-sewage -http://www.survivorlibrary.com/index.php/8-category/9-library-astronomy -http://www.survivorlibrary.com/index.php/8-category/48-library-farming -http://www.survivorlibrary.com/index.php/8-category/169-library-machinerys-reference -http://www.survivorlibrary.com/index.php/8-category/121-library-sewing -http://www.survivorlibrary.com/index.php/8-category/10-library-baking -http://www.survivorlibrary.com/index.php/8-category/179-library-farming2 -http://www.survivorlibrary.com/index.php/8-category/172-library-mathematics -http://www.survivorlibrary.com/index.php/8-category/122-library-shelter -http://www.survivorlibrary.com/index.php/8-category/11-library-banking -http://www.survivorlibrary.com/index.php/8-category/49-library-farming-corn -http://www.survivorlibrary.com/index.php/8-category/84-library-mechanical%20drawing -http://www.survivorlibrary.com/index.php/8-category/123-library-shipbuilding -http://www.survivorlibrary.com/index.php/8-category/12-library-basketry -http://www.survivorlibrary.com/index.php/8-category/50-library-farming-fish -http://www.survivorlibrary.com/index.php/8-category/85-library-medical%20courses-us%20army -http://www.survivorlibrary.com/index.php/8-category/124-library-shoemaking -http://www.survivorlibrary.com/index.php/8-category/13-library-bee%20journal%20(american) 
-http://www.survivorlibrary.com/index.php/8-category/167-library-farming-grapes_wine_raisins -http://www.survivorlibrary.com/index.php/8-category/86-library-medical-anesthesia -http://www.survivorlibrary.com/index.php/8-category/125-library-shorthand -http://www.survivorlibrary.com/index.php/8-category/14-library-bee%20journal%20(british) -http://www.survivorlibrary.com/index.php/8-category/51-library-farming-potato%20and%20sweet%20potato -http://www.survivorlibrary.com/index.php/8-category/87-library-medical-diagnostics -http://www.survivorlibrary.com/index.php/8-category/126-library-silk%20culture -http://www.survivorlibrary.com/index.php/8-category/15-library-beekeeping -http://www.survivorlibrary.com/index.php/8-category/52-library-firearms-books -http://www.survivorlibrary.com/index.php/8-category/88-library-medical-emergency -http://www.survivorlibrary.com/index.php/8-category/127-library-sliderules%20and%20abacus -http://www.survivorlibrary.com/index.php/8-category/16-library-beekeeping%202 -http://www.survivorlibrary.com/index.php/8-category/53-library-firearms-manuals -http://www.survivorlibrary.com/index.php/8-category/168-library-medical-hypnotism -http://www.survivorlibrary.com/index.php/8-category/128-library-smithing -http://www.survivorlibrary.com/index.php/8-category/17-library-berries -http://www.survivorlibrary.com/index.php/8-category/54-library-fishing -http://www.survivorlibrary.com/index.php/8-category/89-library-medical-medicine%201900-1922 -http://www.survivorlibrary.com/index.php/8-category/129-library-steam%20engines -http://www.survivorlibrary.com/index.php/8-category/18-library-boilermaker -http://www.survivorlibrary.com/index.php/8-category/55-library-food -http://www.survivorlibrary.com/index.php/8-category/90-library-medical-microscopy -http://www.survivorlibrary.com/index.php/8-category/130-library-stone%20and%20masonry -http://www.survivorlibrary.com/index.php/8-category/19-library-bookbinding 
-http://www.survivorlibrary.com/index.php/8-category/56-library-forestry -http://www.survivorlibrary.com/index.php/8-category/91-library-medical-nursing%201900-1921 -http://www.survivorlibrary.com/index.php/8-category/131-library-surveying -http://www.survivorlibrary.com/index.php/8-category/181-library-books%20for%20boys%20and%20girls -http://www.survivorlibrary.com/index.php/8-category/57-library-forging%20and%20casting -http://www.survivorlibrary.com/index.php/8-category/92-library-medical-obstetrics%201900-1922 -http://www.survivorlibrary.com/index.php/8-category/132-library-survival-individual -http://www.survivorlibrary.com/index.php/8-category/20-library-botany -http://www.survivorlibrary.com/index.php/8-category/58-library-formulas -http://www.survivorlibrary.com/index.php/8-category/93-library-medical-surgery%201900-1922 -http://www.survivorlibrary.com/index.php/8-category/133-library-teaching -http://www.survivorlibrary.com/index.php/8-category/21-library-boy%20scout%20manuals -http://www.survivorlibrary.com/index.php/8-category/175-library-fuels -http://www.survivorlibrary.com/index.php/8-category/94-library-medical-surgery%202 -http://www.survivorlibrary.com/index.php/8-category/134-library-teaching-arithmetic -http://www.survivorlibrary.com/index.php/8-category/22-library-brewing%20and%20distilling -http://www.survivorlibrary.com/index.php/8-category/59-library-geodesy -http://www.survivorlibrary.com/index.php/8-category/95-library-medical-x-rays -http://www.survivorlibrary.com/index.php/8-category/135-library-teaching-civics -http://www.survivorlibrary.com/index.php/8-category/23-library-bridges%20and%20dams -http://www.survivorlibrary.com/index.php/8-category/60-library-geography -http://www.survivorlibrary.com/index.php/8-category/96-library-meteorology -http://www.survivorlibrary.com/index.php/8-category/136-library-teaching-phonics -http://www.survivorlibrary.com/index.php/8-category/24-library-butchering 
-http://www.survivorlibrary.com/index.php/8-category/61-library-glassmaking -http://www.survivorlibrary.com/index.php/8-category/180-library-mimeograph -http://www.survivorlibrary.com/index.php/8-category/137-library-teaching-readers -http://www.survivorlibrary.com/index.php/8-category/25-library-canning -http://www.survivorlibrary.com/index.php/8-category/62-library-gunpowder%20and%20explosives -http://www.survivorlibrary.com/index.php/8-category/97-library-misc -http://www.survivorlibrary.com/index.php/8-category/138-library-teaching-readers-mcguffey -http://www.survivorlibrary.com/index.php/8-category/26-library-cheese%20and%20butter%20making -http://www.survivorlibrary.com/index.php/8-category/63-library-hatmaking -http://www.survivorlibrary.com/index.php/8-category/98-library-monasticism -http://www.survivorlibrary.com/index.php/8-category/139-library-telegraph%20and%20telephone -http://www.survivorlibrary.com/index.php/8-category/27-library-chemistry -http://www.survivorlibrary.com/index.php/8-category/64-library-heating -http://www.survivorlibrary.com/index.php/8-category/99-library-morality -http://www.survivorlibrary.com/index.php/8-category/140-library-thanksgiving -http://www.survivorlibrary.com/index.php/8-category/28-library-christmas -http://www.survivorlibrary.com/index.php/8-category/65-library-heavy%20and%20industrial%20machinery -http://www.survivorlibrary.com/index.php/8-category/100-library-mushrooms -http://www.survivorlibrary.com/index.php/8-category/141-library-tobacco -http://www.survivorlibrary.com/index.php/8-category/29-library-clockmaking -http://www.survivorlibrary.com/index.php/8-category/66-library-hemp%20and%20flax -http://www.survivorlibrary.com/index.php/8-category/101-library-musical%20instruments -http://www.survivorlibrary.com/index.php/8-category/142-library-toys -http://www.survivorlibrary.com/index.php/8-category/30-library-coal%20and%20mining -http://www.survivorlibrary.com/index.php/8-category/67-library-herbalism 
-http://www.survivorlibrary.com/index.php/8-category/102-library-navigation -http://www.survivorlibrary.com/index.php/8-category/143-library-trapping%20and%20hunting -http://www.survivorlibrary.com/index.php/8-category/31-library-coffee-tea -http://www.survivorlibrary.com/index.php/8-category/68-library-history-american -http://www.survivorlibrary.com/index.php/8-category/103-library-nbc -http://www.survivorlibrary.com/index.php/8-category/144-library-turpentine,%20glue,%20solvents -http://www.survivorlibrary.com/index.php/8-category/32-library-conduct%20of%20life -http://www.survivorlibrary.com/index.php/8-category/69-library-home%20economics -http://www.survivorlibrary.com/index.php/8-category/104-library-opium -http://www.survivorlibrary.com/index.php/8-category/145-library-veterinary -http://www.survivorlibrary.com/index.php/8-category/33-library-construction -http://www.survivorlibrary.com/index.php/8-category/70-library-horses -http://www.survivorlibrary.com/index.php/8-category/105-library-optometry -http://www.survivorlibrary.com/index.php/8-category/146-library-wagon%20and%20coach%20building -http://www.survivorlibrary.com/index.php/8-category/34-library-cooking%20and%20cookbooks -http://www.survivorlibrary.com/index.php/8-category/71-library-journalism -http://www.survivorlibrary.com/index.php/8-category/106-library-painting%20and%20drawing -http://www.survivorlibrary.com/index.php/8-category/147-library-weaving -http://www.survivorlibrary.com/index.php/8-category/35-library-cotton -http://www.survivorlibrary.com/index.php/8-category/72-library-knitting-lace-needlepoint -http://www.survivorlibrary.com/index.php/8-category/107-library-papermaking -http://www.survivorlibrary.com/index.php/8-category/148-library-welding -http://www.survivorlibrary.com/index.php/8-category/36-library-cycles%20(bi-tri-motor) -http://www.survivorlibrary.com/index.php/8-category/73-library-laundry -http://www.survivorlibrary.com/index.php/8-category/108-library-photography 
-http://www.survivorlibrary.com/index.php/8-category/149-library-wind%20and%20water -http://www.survivorlibrary.com/index.php/8-category/37-library-dentistry -http://www.survivorlibrary.com/index.php/8-category/173-library-law -http://www.survivorlibrary.com/index.php/8-category/109-library-pottery -http://www.survivorlibrary.com/index.php/8-category/151-library-wood-carpentry -http://www.survivorlibrary.com/index.php/8-category/38-library-drilling -http://www.survivorlibrary.com/index.php/8-category/74-library-leather -http://www.survivorlibrary.com/index.php/8-category/110-library-poultry -http://www.survivorlibrary.com/index.php/8-category/152-library-wood-carving -http://www.survivorlibrary.com/index.php/8-category/39-library-economics -http://www.survivorlibrary.com/index.php/8-category/75-library-leisure-games%20and%20sports -http://www.survivorlibrary.com/index.php/8-category/111-library-primers -http://www.survivorlibrary.com/index.php/8-category/153-library-wood-furniture -http://www.survivorlibrary.com/index.php/8-category/40-library-embalming -http://www.survivorlibrary.com/index.php/8-category/76-library-leisure-recreation%20magazine -http://www.survivorlibrary.com/index.php/8-category/112-library-printing -http://www.survivorlibrary.com/index.php/8-category/154-library-work%20magazine -http://www.survivorlibrary.com/index.php/8-category/41-library-encyclopedias -http://www.survivorlibrary.com/index.php/8-category/77-library-leisure-whist -http://www.survivorlibrary.com/index.php/8-category/113-library-radio -http://www.survivorlibrary.com/index.php/8-category/118-library-scientific%20american%20(series%201) -http://www.survivorlibrary.com/index.php/8-category/42-library-engineering-drainage -http://www.survivorlibrary.com/index.php/8-category/78-library-lithography -http://www.survivorlibrary.com/index.php/8-category/114-library-radio%2073%20magazine -http://www.survivorlibrary.com/index.php/8-category/119-library-scientific%20american%20(series%202) \ 
No newline at end of file diff --git a/parse_html_pages.js b/parse_html_pages.js index 15e84f9..5b44172 100644 --- a/parse_html_pages.js +++ b/parse_html_pages.js @@ -1,10 +1,18 @@ const { parse } = require('node-html-parser'); const fs = require('fs/promises'); +/** + * This is a really hacky and ugly method, but essentially we're relying on Node.js to + * parse the HTML and grab all the PDF links. + * + * To be honest, this could've been done with some fancy regex or whatever, but who cares. + */ async function parseHtml() { const pages = await fs.readdir('./pages'); const pdfUrls = []; + const folders = []; + const folderLinks = {}; for (const page of pages) { @@ -12,6 +20,28 @@ async function parseHtml() const root = parse(html); const potentialUrls = root.querySelectorAll('a'); + let folderName = root.querySelector('title').textContent; + // Normalize the folder name + folderName = folderName.replaceAll('/', '_') + .replace('Library-', '') + .replace('LIBRARY-', '') + .trim() + .replaceAll(/\s{2,}/g, ' '); + + // A few special cases, let's just handle these explicitly + if (folderName === 'AEROPLANES') { + folderName = 'Aeroplanes'; + } + + if (folderName === 'ACCOUNTING') { + folderName = 'Accounting'; + } + + if (!folders.includes(folderName)) { + folders.push(folderName); + folderLinks[folderName] = []; + } + let validUrls = 0; for (const anchor of potentialUrls) @@ -24,7 +54,10 @@ async function parseHtml() continue; } + // All URLs in the `href` attributes are relative, so we fix em up. 
const fullUrl = `http://www.survivorlibrary.com${url}`; + const filename = url.split('/').pop(); + folderLinks[folderName].push(filename); // Duplicate if (pdfUrls.includes(fullUrl)) { @@ -39,12 +72,42 @@ async function parseHtml() console.log(`Found ${validUrls} PDF URLs (out of ${potentialUrls.length} potential URLs) in ${page}`); } + /** + * Since the `pdfUrls.sh` script just downloads all the PDFs into one directory, + * We create a script that hardlinks each file to the correct categories. + * + * There are likely some PDFs that are duplicated within 2+ categories, but that's fine imo. + */ + let folderLinkCmds = ['mkdir -p Sorted']; + for (const folder of folders) + { + const links = folderLinks[folder]; + folderLinkCmds.push(`mkdir -p 'Sorted/${folder}'`); + + for (const filename of links) + { + folderLinkCmds.push(`ln '${filename}' 'Sorted/${folder}/${filename}'`); + } + } + await fs.writeFile('./pdfUrls.json', JSON.stringify(pdfUrls)); await fs.writeFile('./pdfUrls.txt', pdfUrls.join('\n')); + await fs.writeFile('./folders.json', JSON.stringify(folders, null, 4)); + await fs.writeFile('./folderLink.sh', folderLinkCmds.join('\n')); + /** + * It seems the web server for SurvivalLibrary doesn't support + * the `Range` HTTP header. We can't just "continue" a download. + * + * I wouldn't be surprised if one (or more) of the PDFs end up corrupted, as we just check if the file _exists_ + * before skipping it (if it does exist). + * + * An alternative would be to attempt to parse each PDF locally + * to verify that it's at least a valid PDF. + */ const scriptOutput = pdfUrls.map(url => { const filename = url.split('/').pop(); - return `[ ! -f "${filename}" ] && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - "${url}"`; + return `[ ! 
-f '${filename}' ] && (echo "Downloading: ${filename}" && curl -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" -fsSLO -C - -g '${url}');`; }); await fs.writeFile('./pdfUrls.sh', scriptOutput.join('\n')); diff --git a/validate_pdfs.sh b/validate_pdfs.sh new file mode 100644 index 0000000..1209e1a --- /dev/null +++ b/validate_pdfs.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Requires the `pdfinfo` command. +# On Debian/Ubuntu it is provided by the `poppler-utils` package. + +# The list of broken PDFs is written to the `validate.log` file. +# Most of the code comes from a Super User answer: https://superuser.com/a/580895 + +# I highly recommend manually verifying the PDFs that are considered "broken" by this script. +# When I ran this script, it found 19 "broken" PDFs. +# 1 of them wasn't broken. The other 18 were. +# 1 of the remaining 18 was just a corrupted download, +# but the other 17 were actually broken and didn't even work when attempting to load from survivorlibrary.com + +for f in *.pdf; do + if ! pdfinfo "$f" &> /dev/null; then + now="$(date +"%Y-%m-%d %H:%M:%S")"; + echo "[$now] $f is broken" >> validate.log; + fi +done \ No newline at end of file