SurvivorLibraryScrape/get_pages_with_pdfs.sh

#!/bin/bash
# This script requests the HTML page for each URL in survivorlibrary_pages.txt
# and dumps each one into its own file in the `pages` directory.
# Pages are requested with a 1.5 second pause in between,
# so grabbing all of them takes roughly 6 minutes.
#
# Dumping every page lets us process them "offline" in peace,
# without hitting the website each time all the categories are reprocessed (e.g. when the other scripts are tweaked).
OUTPUT_DIR="pages"
mkdir -p "${OUTPUT_DIR}"

# Read the category page URLs one per line from the list file.
while IFS= read -r page; do
    # Build the output filename from the 6th '/'-separated field of the URL,
    # turning URL-encoded spaces (%20) into underscores.
    name="$(echo "${page}" | cut -d'/' -f 6 | sed 's/%20/_/g').html"
    file_path="${OUTPUT_DIR}/${name}"
    echo "${file_path}"
    # -C - resumes/skips files that were already downloaded; the headers
    # mimic a normal browser request.
    curl -fsSL -o "${file_path}" \
        -H "Referer: http://www.survivorlibrary.com/library-download.html" \
        -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" \
        -C - "${page}"
    # Pause between requests to go easy on the server.
    sleep 1.5s
done < survivorlibrary_pages.txt
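
# The dumped pages are meant to be picked apart offline by the repo's other
# scripts. As a rough, hypothetical sketch of that kind of offline pass (not
# the repo's actual approach), something like the line below would collect the
# PDF links from the saved HTML, assuming they appear as plain href="...pdf"
# attributes; the pdf_links.txt output name is made up for this example:
#
#   grep -hoE 'href="[^"]+\.pdf"' pages/*.html | sed -e 's/^href="//' -e 's/"$//' | sort -u > pdf_links.txt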