#!/bin/bash

# The script basically just requests the HTML page for each URL
# and dumps it into an individual file in the `pages` directory.
# The pages are requested with a 1.5 second pause in between,
# so it'll take roughly 6 minutes to grab all the pages.
#
# The reason all the pages are dumped is so that we can process them "offline" in peace,
# without hitting the website every time we want to process all the categories
# (e.g. when the other scripts are tweaked).

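# All fetched pages get dumped into this directory.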
OUTPUT_DIR="pages";
mkdir -p "${OUTPUT_DIR}";
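
# survivorlibrary_pages.txt is expected to contain one page URL per line.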
for page in $(cat survivorlibrary_pages.txt);
do
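    # Derive the output filename from the 6th '/'-separated segment of the URL,
    # replacing URL-encoded spaces (%20) with underscores.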
    name="$(echo "$page" | cut -d'/' -f 6 | sed 's/%20/_/g').html";
    file_path="${OUTPUT_DIR}/${name}";
    echo "${file_path}";
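    # -f: fail on HTTP errors, -sS: silent but still show errors, -L: follow redirects,
    # -C -: resume a partial download if the file already exists.
    # The Referer and User-Agent headers make the request look like a normal browser visit.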
    curl -fsSL -o "${file_path}" \
        -H "Referer: http://www.survivorlibrary.com/library-download.html" \
        -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" \
        -C - "$page";
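    # Pause between requests so we don't hammer the site.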
    sleep 1.5s;
done