# 2009scape-website/parse.py

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

from os import listdir
from os.path import isfile, join

# Directory that is scanned for pages to convert. Each entry name is later
# appended to the local server URL, so this folder is expected to mirror what
# the server exposes under /site/community/.
mypath = "C:\\Users\\Anon\\Projects\\2009scape-website\\site\\community\\"

# Names (not full paths) of the regular files directly inside `mypath`;
# sub-directories are left out.
onlyfiles = list(
    filter(lambda name: isfile(join(mypath, name)), listdir(mypath))
)

# Browser-like User-Agent sent with every request. Built once, outside the
# loop, because it is identical for every page.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"}

# Convert every listed page: fetch it from the local server, then write a
# Jekyll file (YAML front matter + the page's #content markup) to test/<name>.
for f in onlyfiles:
    url = "http://localhost:8000/site/community/" + f
    print(url)

    # A timeout keeps one unresponsive request from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=30)
    if not response.ok:
        # An error page must not be turned into a guide; report and move on.
        print("skipped " + url + " (HTTP " + str(response.status_code) + ")")
        continue
    soup = BeautifulSoup(response.content, 'lxml')

    # Everything is parsed before the output file is opened, so a failure
    # here never leaves a half-written file behind.
    #
    # `soup.title` is None when the page has no <title>, and `.string` is
    # None when the <title> holds nested markup; in both cases fall back to
    # the file name without its extension instead of crashing.
    raw_title = soup.title.string if soup.title is not None else None
    if raw_title is None:
        raw_title = f.rsplit(".", 1)[0]

    # Drop the site-name prefix (both capitalisations occur).
    title = raw_title.replace("2009scape - ", "")
    title = title.replace("2009Scape - ", "")
    # Collapse whitespace/newlines so the title stays on one front-matter line.
    title = " ".join(title.split())
    # Emit the title as a YAML double-quoted scalar so characters such as
    # ':' or '#' cannot break the front matter; backslashes and quotes are
    # escaped as that style requires.
    yaml_title = '"' + title.replace('\\', '\\\\').replace('"', '\\"') + '"'

    ## Jekyll
    # `with` guarantees the file is closed even if a write fails; UTF-8 is
    # set explicitly so the result does not depend on the platform default.
    with open("test/" + f, "w", encoding="utf-8") as tmp:
        tmp.write('---\n')
        tmp.write('title: ' + yaml_title + '\n')
        tmp.write('tags: community' + '\n')
        tmp.write('layout: guide' + '\n')
        # NOTE(review): the disabled fields below are kept for reference; they
        # look like they belong to dated/authored pages -- confirm before
        # re-enabling.
        #tmp.write('collection: Game Updates'+'\n')
        #tmp.write('date: '+f[:-5]+" 00:00:00 +0000"+'\n')
        #tmp.write('authors: '+soup.find("div", {"class": "msgcreator uname"}).text.strip()+"\n")
        tmp.write('---\n')
        # Body: every element with id="content", transliterated to ASCII.
        for hit in soup.find_all(attrs={'id': 'content'}):
            tmp.write(unidecode(str(hit)))