# -*- coding: utf-8 -*- # Copyright 2015-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from https://www.deviantart.com/""" from .common import Extractor, Message from .. import text, util, exception from ..cache import cache, memcache import collections import itertools import mimetypes import math import time import re BASE_PATTERN = ( r"(?:https?://)?(?:" r"(?:www\.)?deviantart\.com/([\w-]+)|" r"(?!www\.)([\w-]+)\.deviantart\.com)" ) class DeviantartExtractor(Extractor): """Base class for deviantart extractors using the OAuth API""" category = "deviantart" directory_fmt = ("{category}", "{author[username]!l}") filename_fmt = "{category}_{index}_{title}.{extension}" root = "https://www.deviantart.com" def __init__(self, match=None): Extractor.__init__(self, match) self.offset = 0 self.flat = self.config("flat", True) self.extra = self.config("extra", False) self.quality = self.config("quality", "100") self.original = self.config("original", True) self.user = match.group(1) or match.group(2) self.group = False self.api = DeviantartAPI(self) if self.quality: self.quality = "q_{}".format(self.quality) if self.original != "image": self._update_content = self._update_content_default else: self._update_content = self._update_content_image self.original = True self.commit_journal = { "html": self._commit_journal_html, "text": self._commit_journal_text, }.get(self.config("journals", "html")) def skip(self, num): self.offset += num return num def items(self): if self.user: self.group = not self.api.user_profile(self.user) if self.group: self.subcategory = "group-" + self.subcategory yield Message.Version, 1 for deviation in self.deviations(): if isinstance(deviation, tuple): url, data = deviation yield Message.Queue, url, data continue self.prepare(deviation) yield Message.Directory, deviation if "content" in deviation: content = deviation["content"] if self.original and deviation["is_downloadable"] and \ text.ext_from_url(content["src"]) != "gif": self._update_content(deviation, content) if content["src"].startswith("https://images-wixmp-"): if deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 content["src"] = re.sub( r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", content["src"]) if self.quality: content["src"] = re.sub( r"q_\d+", self.quality, content["src"]) yield self.commit(deviation, content) elif deviation["is_downloadable"]: content = self.api.deviation_download(deviation["deviationid"]) yield self.commit(deviation, content) if "videos" in deviation: video = max(deviation["videos"], key=lambda x: text.parse_int(x["quality"][:-1])) yield self.commit(deviation, video) if "flash" in deviation: yield self.commit(deviation, deviation["flash"]) if "excerpt" in deviation and self.commit_journal: journal = self.api.deviation_content(deviation["deviationid"]) yield self.commit_journal(deviation, journal) if self.extra: for match in DeviantartStashExtractor.pattern.finditer( deviation.get("description", "")): deviation["_extractor"] = DeviantartStashExtractor yield Message.Queue, match.group(0), deviation def deviations(self): """Return an iterable containing all relevant Deviation-objects""" def prepare(self, deviation): """Adjust the contents of a Deviation-object""" try: deviation["index"] = text.parse_int( deviation["url"].rpartition("-")[2]) except KeyError: deviation["index"] = 0 if self.user: deviation["username"] = self.user deviation["da_category"] = deviation["category"] deviation["published_time"] = text.parse_int( deviation["published_time"]) deviation["date"] = text.parse_timestamp( deviation["published_time"]) # filename metadata alphabet = "0123456789abcdefghijklmnopqrstuvwxyz" sub = re.compile(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", util.bencode(deviation["index"], alphabet), )) @staticmethod def commit(deviation, target): url = target["src"] target = target.copy() target["filename"] = deviation["filename"] deviation["target"] = target deviation["extension"] = target["extension"] = text.ext_from_url(url) return Message.Url, url, deviation def _commit_journal_html(self, deviation, journal): title = text.escape(deviation["title"]) url = deviation["url"] thumbs = deviation.get("thumbs") or deviation.get("files") html = journal["html"] shadow = SHADOW_TEMPLATE.format_map(thumbs[0]) if thumbs else "" if "css" in journal: css, cls = journal["css"], "withskin" elif html.startswith("") css = css.partition(">")[2] cls = "withskin" else: css, cls = "", "journal-green" if html.find('
', 0, 250) != -1: needle = '
' header = HEADER_CUSTOM_TEMPLATE.format( title=title, url=url, date=deviation["date"], ) else: needle = '
' catlist = deviation["category_path"].split("/") categories = " / ".join( ('{}' '').format(self.root, cpath, cat.capitalize()) for cat, cpath in zip( catlist, itertools.accumulate(catlist, lambda t, c: t + "/" + c) ) ) username = deviation["author"]["username"] urlname = deviation.get("username") or username.lower() header = HEADER_TEMPLATE.format( title=title, url=url, userurl="{}/{}/".format(self.root, urlname), username=username, date=deviation["date"], categories=categories, ) if needle in html: html = html.replace(needle, header, 1) else: html = JOURNAL_TEMPLATE_HTML_EXTRA.format(header, html) html = JOURNAL_TEMPLATE_HTML.format( title=title, html=html, shadow=shadow, css=css, cls=cls) deviation["extension"] = "htm" return Message.Url, html, deviation @staticmethod def _commit_journal_text(deviation, journal): html = journal["html"] if html.startswith("")[2] content = "\n".join( text.unescape(text.remove_html(txt)) for txt in html.rpartition("") ) txt = JOURNAL_TEMPLATE_TEXT.format( title=deviation["title"], username=deviation["author"]["username"], date=deviation["date"], content=content, ) deviation["extension"] = "txt" return Message.Url, txt, deviation @staticmethod def _find_folder(folders, name): pattern = re.compile(r"(?i)\W*" + name.replace("-", r"\W+") + r"\W*$") for folder in folders: if pattern.match(folder["name"]): return folder raise exception.NotFoundError("folder") def _folder_urls(self, folders, category): url = "{}/{}/{}/0/".format(self.root, self.user, category) return [(url + folder["name"], folder) for folder in folders] def _update_content_default(self, deviation, content): content.update(self.api.deviation_download(deviation["deviationid"])) def _update_content_image(self, deviation, content): data = self.api.deviation_download(deviation["deviationid"]) url = data["src"].partition("?")[0] mtype = mimetypes.guess_type(url, False)[0] if mtype and mtype.startswith("image/"): content.update(data) class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" subcategory = "gallery" archive_fmt = "g_{username}_{index}.{extension}" pattern = BASE_PATTERN + r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$" test = ( ("https://www.deviantart.com/shimoda7/gallery/", { "pattern": r"https://(s3.amazonaws.com/origin-(img|orig)" r".deviantart.net/|images-wixmp-\w+.wixmp.com/)", "count": ">= 30", "keyword": { "allows_comments": bool, "author": { "type": "regular", "usericon": str, "userid": "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B", "username": "shimoda7", }, "category_path": str, "content": { "filesize": int, "height": int, "src": str, "transparency": bool, "width": int, }, "da_category": str, "date": "type:datetime", "deviationid": str, "?download_filesize": int, "extension": str, "index": int, "is_deleted": bool, "is_downloadable": bool, "is_favourited": bool, "is_mature": bool, "preview": { "height": int, "src": str, "transparency": bool, "width": int, }, "published_time": int, "stats": { "comments": int, "favourites": int, }, "target": dict, "thumbs": list, "title": str, "url": r"re:https://www.deviantart.com/shimoda7/art/[^/]+-\d+", "username": "shimoda7", }, }), # group ("https://www.deviantart.com/yakuzafc", { "pattern": r"https://www.deviantart.com/yakuzafc/gallery/0/", "count": ">= 15", }), # 'folders' option (#276) ("https://www.deviantart.com/justatest235723", { "count": 3, "options": (("metadata", 1), ("folders", 1), ("original", 0)), "keyword": { "description": str, "folders": list, "is_watching": bool, "license": str, "tags": list, }, }), ("https://www.deviantart.com/shimoda8/gallery/", { "exception": exception.NotFoundError, }), # old-style URLs ("https://www.deviantart.com/shimoda7/gallery/?catpath=/"), ("https://shimoda7.deviantart.com/gallery/"), ("https://yakuzafc.deviantart.com/"), ("https://shimoda7.deviantart.com/gallery/?catpath=/"), ) def deviations(self): if self.flat and not self.group: return self.api.gallery_all(self.user, self.offset) folders = self.api.gallery_folders(self.user) return self._folder_urls(folders, "gallery") class DeviantartFolderExtractor(DeviantartExtractor): """Extractor for deviations inside an artist's gallery folder""" subcategory = "folder" directory_fmt = ("{category}", "{folder[owner]}", "{folder[title]}") archive_fmt = "F_{folder[uuid]}_{index}.{extension}" pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?&#]+)" test = ( # user ("https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", { "count": 5, "options": (("original", False),), }), # group ("https://www.deviantart.com/yakuzafc/gallery/37412168/Crafts", { "count": ">= 4", "options": (("original", False),), }), ("https://shimoda7.deviantart.com/gallery/722019/Miscellaneous"), ("https://yakuzafc.deviantart.com/gallery/37412168/Crafts"), ) def __init__(self, match): DeviantartExtractor.__init__(self, match) self.fname = match.group(4) self.folder = {"owner": self.user, "index": match.group(3)} def deviations(self): folders = self.api.gallery_folders(self.user) folder = self._find_folder(folders, self.fname) self.folder["title"] = folder["name"] self.folder["uuid"] = folder["folderid"] return self.api.gallery(self.user, folder["folderid"], self.offset) def prepare(self, deviation): DeviantartExtractor.prepare(self, deviation) deviation["folder"] = self.folder class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" archive_fmt = "{index}.{extension}" pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" test = ( ("https://sta.sh/022c83odnaxc", { "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net", "count": 1, }), # multiple stash items ("https://sta.sh/21jf51j7pzl2", { "pattern": pattern, "count": 4, }), # downloadable, but no "content" field (#307) ("https://sta.sh/024t4coz16mi", { "count": 1, }), ("https://sta.sh/abcdefghijkl", { "exception": exception.HttpError, }), ) skip = Extractor.skip def __init__(self, match): DeviantartExtractor.__init__(self, match) self.user = None self.stash_id = match.group(1) def deviations(self): url = "https://sta.sh/" + self.stash_id page = self.request(url).text deviation_id = text.extract(page, '//deviation/', '"')[0] if deviation_id: yield self.api.deviation(deviation_id) else: data = {"_extractor": DeviantartStashExtractor} page = text.extract( page, '