2009scape-website/parse.py
2021-09-19 14:57:02 -07:00

37 lines
1.3 KiB
Python

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from os import listdir
from os.path import isfile, join
# Convert the exported community pages into Jekyll pages.
#
# Every regular file found in ``mypath`` is fetched through the local
# development server, its element(s) with id "content" are extracted, and the
# result is written to ``test/<file name>`` preceded by a front-matter header
# (title, tags, layout). The ``test/`` directory must already exist.
mypath = "C:\\Users\\Anon\\Projects\\2009scape-website\\site\\community\\"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# Loop-invariant: the same browser-like user agent is sent with every request.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"}

for f in onlyfiles:
    url = "http://localhost:8000/site/community/" + f
    print(url)

    # A timeout keeps a stalled server from hanging the run, and the status
    # check stops an error page from being converted as if it were content.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # Resolve the title BEFORE opening the output file, so that a failure here
    # cannot leave a truncated file behind. Pages without a usable <title>
    # fall back to the file name instead of raising AttributeError.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    if title is None:
        title = f
    # Drop the site-name prefix in either capitalisation.
    title = title.replace("2009scape - ", "")
    title = title.replace("2009Scape - ", "")

    ## Jekyll
    # The context manager closes the file even if a write fails; the explicit
    # UTF-8 encoding keeps non-ASCII titles intact regardless of the platform
    # default (the body is passed through unidecode, the title is not).
    with open("test/" + f, "w", encoding="utf-8") as tmp:
        tmp.write('---\n')
        tmp.write('title: ' + title + '\n')
        tmp.write('tags: community' + '\n')
        tmp.write('layout: guide' + '\n')
        # Optional front-matter fields, currently disabled:
        #tmp.write('collection: Game Updates'+'\n')
        #tmp.write('date: '+f[:-5]+" 00:00:00 +0000"+'\n')
        #tmp.write('authors: '+soup.find("div", {"class": "msgcreator uname"}).text.strip()+"\n")
        tmp.write('---\n')
        # Write every element whose id is "content", passed through unidecode.
        for hit in soup.find_all(attrs={'id': 'content'}):
            tmp.write(unidecode(str(hit)))