#!/bin/bash

# The script basically just requests the HTML page for each URL
# and dumps it into an individual file in the `pages` directory.
# The pages are requested with a 1.5 second pause in between,
# so it'll take roughly 6 minutes to grab all the pages.
#
# The reason all the pages are dumped is so that we can process them "offline" in peace,
# without hitting the website every time we want to process all the categories
# (e.g. when the other scripts are tweaked).

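# All fetched pages get dumped into this directory.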
OUTPUT_DIR="pages";
mkdir -p "${OUTPUT_DIR}";
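
# survivorlibrary_pages.txt is expected to contain one page URL per line.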
for page in $(cat survivorlibrary_pages.txt);
do
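    # Derive the output filename from the 6th '/'-separated segment of the URL,
    # replacing URL-encoded spaces (%20) with underscores.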
    name="$(echo "$page" | cut -d'/' -f 6 | sed 's/%20/_/g').html";
    file_path="${OUTPUT_DIR}/${name}";
    echo "${file_path}";
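    # -f: fail on HTTP errors, -sS: silent but still show errors, -L: follow redirects,
    # -C -: resume a partial download if the file already exists.
    # The Referer and User-Agent headers make the request look like a normal browser visit.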
    curl -fsSL -o "${file_path}" \
        -H "Referer: http://www.survivorlibrary.com/library-download.html" \
        -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0" \
        -C - "$page";
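    # Pause between requests so we don't hammer the site.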
    sleep 1.5s;
done