#!/bin/bash
#
# This script requests the HTML page of each URL listed in survivorlibrary_pages.txt
# and dumps it into an individual file in the `pages` directory.
# The pages are requested with a 1.5 second pause in between,
# so it'll take roughly 6 minutes to grab all of them.
#
# The reason all the pages are dumped is so that we can process them "offline" in peace,
# without hitting the website every time we want to process all the categories
# (e.g. when the other scripts are tweaked).

OUTPUT_DIR="pages"
mkdir -p "${OUTPUT_DIR}"

for page in $(cat survivorlibrary_pages.txt); do
    # Derive the output file name from the 6th '/'-separated field of the URL,
    # replacing URL-encoded spaces (%20) with underscores.
    name="$(echo "$page" | cut -d'/' -f 6 | sed 's/%20/_/g').html"
    file_path="${OUTPUT_DIR}/${name}"
    echo "$file_path"

    # -C - lets curl resume a partial download if the file already exists.
    curl -fsSL -o "${file_path}" \
        -H "Referer: http://www.survivorlibrary.com/library-download.html" \
        -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" \
        -C - \
        "$page"

    # Be polite to the server between requests.
    sleep 1.5s
done
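
# A quick usage sketch. The file name fetch_pages.sh is only a placeholder for
# whatever this script is actually saved as; survivorlibrary_pages.txt and the
# pages/ directory are the names used above.
#
#   chmod +x fetch_pages.sh
#   ./fetch_pages.sh     # expects survivorlibrary_pages.txt in the working directory
#   ls pages/            # one <category>.html file per fetched URL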