2020-09-25 15:18:21 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for https://www.weasyl.com/"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text
|
|
|
|
|
|
|
|
# Unescaped '.' would match any character; escape it so only the real
# domain "weasyl.com" is accepted.
BASE_PATTERN = r"(?:https://)?(?:www\.)?weasyl\.com/"
|
|
|
|
|
|
|
|
|
|
|
|
class WeasylExtractor(Extractor):
    """Base class for weasyl extractors"""
    category = "weasyl"
    directory_fmt = ("{category}", "{owner_login}")
    filename_fmt = "{submitid} {title}.{extension}"
    archive_fmt = "{submitid}"
    root = "https://www.weasyl.com"

    @staticmethod
    def populate_submission(data):
        """Fill in 'url', 'date', and filename fields in-place.

        Returns False for submissions without any downloadable content,
        which callers are expected to skip.
        """
        media = data["media"]
        if "submission" not in media:
            # Some submissions don't have content and can be skipped
            return False
        data["url"] = media["submission"][0]["url"]
        data["date"] = text.parse_datetime(
            data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S")
        text.nameext_from_url(data["url"], data)
        return True

    def _init(self):
        # authenticate every request through the API key header;
        # a missing "api-key" option leaves the header value as None
        self.session.headers['X-Weasyl-API-Key'] = self.config("api-key")

    def request_submission(self, submitid):
        """Fetch the full metadata of a single submission"""
        url = "{}/api/submissions/{}/view".format(self.root, submitid)
        return self.request(url).json()

    def retrieve_journal(self, journalid):
        """Fetch a journal and prepare its content for 'text:' download"""
        url = "{}/api/journals/{}/view".format(self.root, journalid)
        data = self.request(url).json()
        data["extension"] = "html"
        data["html"] = "text:" + data["content"]
        data["date"] = text.parse_datetime(data["posted_at"])
        return data

    def submissions(self, owner_login, folderid=None):
        """Yield Url messages for all gallery submissions of a user

        When the "metadata" option is enabled, each submission is
        re-fetched individually to obtain its complete metadata.
        """
        fetch_details = self.config("metadata")
        url = "{}/api/users/{}/gallery".format(self.root, owner_login)
        params = {"nextid": None, "folderid": folderid}

        while True:
            page = self.request(url, params=params).json()

            for submission in page["submissions"]:
                if fetch_details:
                    submission = self.request_submission(
                        submission["submitid"])
                if self.populate_submission(submission):
                    submission["folderid"] = folderid
                    # Do any submissions have more than one url? If so
                    # a urllist of the submission array urls would work.
                    yield Message.Url, submission["url"], submission

            nextid = page["nextid"]
            if not nextid:
                return
            params["nextid"] = nextid
|
2020-09-25 15:18:21 +02:00
|
|
|
|
|
|
|
|
|
|
|
class WeasylSubmissionExtractor(WeasylExtractor):
    """Extractor for a single weasyl submission"""
    subcategory = "submission"
    pattern = BASE_PATTERN + r"(?:~[\w~-]+/submissions|submission)/(\d+)"
    example = "https://www.weasyl.com/~USER/submissions/12345/TITLE"

    def __init__(self, match):
        WeasylExtractor.__init__(self, match)
        self.submitid = match.group(1)

    def items(self):
        metadata = self.request_submission(self.submitid)
        # skip submissions without downloadable content
        if not self.populate_submission(metadata):
            return
        yield Message.Directory, metadata
        yield Message.Url, metadata["url"], metadata
|
|
|
|
|
|
|
|
|
|
|
|
class WeasylSubmissionsExtractor(WeasylExtractor):
    """Extractor for all submissions of a weasyl user"""
    subcategory = "submissions"
    pattern = BASE_PATTERN + r"(?:~|submissions/)([\w~-]+)/?$"
    example = "https://www.weasyl.com/submissions/USER"

    def __init__(self, match):
        WeasylExtractor.__init__(self, match)
        self.owner_login = match.group(1)

    def items(self):
        owner = self.owner_login
        yield Message.Directory, {"owner_login": owner}
        yield from self.submissions(owner)
|
2020-09-25 15:18:21 +02:00
|
|
|
|
|
|
|
|
|
|
|
class WeasylFolderExtractor(WeasylExtractor):
    """Extractor for a submission folder of a weasyl user"""
    subcategory = "folder"
    directory_fmt = ("{category}", "{owner_login}", "{folder_name}")
    pattern = BASE_PATTERN + r"submissions/([\w~-]+)\?folderid=(\d+)"
    example = "https://www.weasyl.com/submissions/USER?folderid=12345"

    def __init__(self, match):
        WeasylExtractor.__init__(self, match)
        self.owner_login, self.folderid = match.groups()

    def items(self):
        # renamed from 'iter' to avoid shadowing the builtin
        submissions = self.submissions(self.owner_login, self.folderid)

        # Folder names are only on single submission api calls,
        # so fetch full details of the first submission to build
        # the directory metadata.
        # Use next(..., None) so an empty folder ends the generator
        # cleanly instead of raising RuntimeError via PEP 479.
        first = next(submissions, None)
        if first is None:
            return
        msg, url, data = first
        details = self.request_submission(data["submitid"])
        yield Message.Directory, details
        yield msg, url, data
        yield from submissions
|
|
|
|
|
|
|
|
|
|
|
|
class WeasylJournalExtractor(WeasylExtractor):
    """Extractor for a single weasyl journal"""
    subcategory = "journal"
    filename_fmt = "{journalid} {title}.{extension}"
    archive_fmt = "{journalid}"
    pattern = BASE_PATTERN + r"journal/(\d+)"
    example = "https://www.weasyl.com/journal/12345"

    def __init__(self, match):
        WeasylExtractor.__init__(self, match)
        self.journalid = match.group(1)

    def items(self):
        journal = self.retrieve_journal(self.journalid)
        yield Message.Directory, journal
        # journal content is downloaded from its inline 'text:' URL
        yield Message.Url, journal["html"], journal
|
|
|
|
|
|
|
|
|
|
|
|
class WeasylJournalsExtractor(WeasylExtractor):
    """Extractor for all journals of a weasyl user"""
    subcategory = "journals"
    filename_fmt = "{journalid} {title}.{extension}"
    archive_fmt = "{journalid}"
    pattern = BASE_PATTERN + r"journals/([\w~-]+)"
    example = "https://www.weasyl.com/journals/USER"

    def __init__(self, match):
        WeasylExtractor.__init__(self, match)
        self.owner_login = match.group(1)

    def items(self):
        yield Message.Directory, {"owner_login": self.owner_login}

        # scrape journal IDs from the HTML overview page; there is no
        # list endpoint in the API for journals
        page = self.request(
            "{}/journals/{}".format(self.root, self.owner_login)).text
        for journalid in text.extract_iter(page, 'href="/journal/', '/'):
            journal = self.retrieve_journal(journalid)
            yield Message.Url, journal["html"], journal
|
2020-09-26 02:11:37 +02:00
|
|
|
|
|
|
|
|
|
|
|
class WeasylFavoriteExtractor(WeasylExtractor):
    """Extractor for a weasyl user's favorite submissions"""
    subcategory = "favorite"
    directory_fmt = ("{category}", "{owner_login}", "Favorites")
    pattern = BASE_PATTERN + r"favorites\?userid=(\d+)"
    example = "https://www.weasyl.com/favorites?userid=12345"

    def __init__(self, match):
        WeasylExtractor.__init__(self, match)
        self.userid = match.group(1)

    def items(self):
        owner_login = lastid = None
        url = self.root + "/favorites"
        params = {"userid": self.userid, "feature": "submit"}

        while True:
            page = self.request(url, params=params).text
            # only IDs after this marker belong to the favorites list
            pos = page.index('id="favorites-content"')

            if not owner_login:
                owner_login = text.extr(page, '<a href="/~', '"')

            for submitid in text.extract_iter(page, "/submissions/", "/", pos):
                # consecutive duplicates appear at page boundaries
                if submitid == lastid:
                    continue
                lastid = submitid
                favorite = self.request_submission(submitid)
                if not self.populate_submission(favorite):
                    continue
                favorite["user"] = owner_login
                yield Message.Directory, favorite
                yield Message.Url, favorite["url"], favorite

            if "&nextid=" not in page:
                return
            # last seen submission ID acts as the pagination cursor
            params["nextid"] = submitid
|