2015-10-04 04:13:50 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2017-02-23 20:58:39 +01:00
|
|
|
# Copyright 2015-2017 Mike Fährmann
|
2015-10-04 04:13:50 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extract images from http://www.deviantart.com/"""
|
|
|
|
|
2017-01-12 21:08:49 +01:00
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text, exception
|
|
|
|
from ..cache import cache
|
2017-03-08 16:40:20 +01:00
|
|
|
import time
|
2015-10-04 04:13:50 +02:00
|
|
|
|
2017-01-12 21:08:49 +01:00
|
|
|
|
|
|
|
class DeviantartUserExtractor(Extractor):
    """Extractor for all works from an artist on deviantart.com"""
    category = "deviantart"
    subcategory = "user"
    directory_fmt = ["{category}", "{username}"]
    filename_fmt = "{category}_{index}_{title}.{extension}"
    # Group 1 captures the subdomain in front of ".deviantart.com";
    # it is stored as the user name in __init__.
    pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com(?:/gallery)?/?$"]
    test = [("http://shimoda7.deviantart.com/gallery/", {
        "url": "63bfa8efba199e27181943c9060f6770f91a8441",
        "keyword": "4ffe227a50f373faf643d7e5ae89a04859af8d19",
    })]

    def __init__(self, match):
        """Set up the API client and remember the requested user.

        'match' is the match object produced by one of the regular
        expressions in 'pattern'.
        """
        Extractor.__init__(self)
        self.api = DeviantartAPI(self)
        self.user = match.group(1)
        # start offset handed to the gallery API; advanced by skip()
        self.offset = 0

    def skip(self, num):
        """Advance the start offset by 'num' entries and return 'num'"""
        self.offset += num
        return num

    def items(self):
        """Yield messages for every deviation of the user with content.

        Emits the version message first, then a single directory message
        carrying the 'author' object of the first usable deviation, and
        finally one URL message per deviation.
        """
        first = True
        yield Message.Version, 1
        for deviation in self.api.gallery_all(self.user, self.offset):
            # entries without a "content" object have no file URL to yield
            if "content" not in deviation:
                continue
            if first:
                first = False
                yield Message.Directory, deviation["author"]
            # Drop the "stats" entry from the metadata. Use pop() with a
            # default so a deviation lacking "stats" does not abort the
            # whole run with a KeyError.
            deviation.pop("stats", None)
            # the index is whatever follows the last "-" of the page URL
            deviation["index"] = deviation["url"].rsplit("-", maxsplit=1)[-1]
            yield Message.Url, deviation["content"]["src"], deviation
|
2015-12-06 21:13:57 +01:00
|
|
|
|
|
|
|
|
2016-09-12 10:20:57 +02:00
|
|
|
class DeviantartImageExtractor(Extractor):
    """Extractor for single images from deviantart.com"""
    category = "deviantart"
    subcategory = "image"
    directory_fmt = ["{category}", "{artist}"]
    filename_fmt = "{category}_{index}_{title}.{extension}"
    # Group 1 is the URL without its scheme, group 2 the trailing number.
    pattern = [r"(?:https?://)?([^\.]+\.deviantart\.com/art/.+-(\d+))"]
    test = [(("http://shimoda7.deviantart.com/art/"
              "For-the-sake-of-a-memory-10073852"), {
        "url": "71345ce3bef5b19bd2a56d7b96e6b5ddba747c2e",
        "keyword": "ccac27b8f740fc943afca9460608e02c6cbcdf96",
        "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
    })]

    def __init__(self, match):
        """Store page URL and image index taken from the pattern match"""
        Extractor.__init__(self)
        self.index = match.group(2)
        self.url = "https://" + match.group(1)
        self.session.cookies["agegate_state"] = "1"

    def items(self):
        """Fetch the deviation page and yield its messages.

        Yields the version message, a directory message and a single URL
        message; the latter two carry the collected metadata dict.
        """
        html = self.request(self.url).text
        metadata = self.get_data(html)
        metadata.update(self.get_image(html))

        # The length of the raw (still escaped) title is needed as start
        # position for the date lookup below, so take it before unescaping.
        raw_title_length = len(metadata["title"])
        text.nameext_from_url(metadata["image"], metadata)
        metadata["title"] = text.unescape(metadata["title"])

        # the description is unescaped twice
        description = text.unescape(metadata["description"])
        metadata["description"] = text.unescape(description)

        artist, _ = text.extract(metadata["url"], "//", ".")
        metadata["artist"] = artist
        date, _ = text.extract(
            metadata["date"], ", ", " in ", raw_title_length)
        metadata["date"] = date

        yield Message.Version, 1
        yield Message.Directory, metadata
        yield Message.Url, metadata["image"], metadata

    def get_data(self, page):
        """Collect metadata for extractor-job"""
        rules = (
            ('title'      , '"og:title" content="', '"'),
            ('url'        , '"og:url" content="', '"'),
            ('description', '"og:description" content="', '"'),
            (None         , '<span class="tt-w">', ''),
            ('date'       , 'title="', '"'),
        )
        # the result dict is pre-seeded with the index from the URL
        metadata, _ = text.extract_all(
            page, rules, values={"index": self.index})
        return metadata

    def get_image(self, page):
        """Find image-url and -dimensions

        Tries three places in turn - the 'og:image' values, the main
        image element and the download link - and returns a dict with
        the keys 'image', 'width' and 'height'. The first two results
        are only accepted if the URL starts with "https://orig".
        """
        wanted_prefix = "https://orig"

        preview_rules = (
            ('image' , '"og:image" content="', '"'),
            ('width' , '"og:image:width" content="', '"'),
            ('height', '"og:image:height" content="', '"'),
        )
        main_rules = (
            (None    , 'class="dev-content-normal "', ''),
            ('image' , ' src="', '"'),
            ('width' , ' width="', '"'),
            ('height', ' height="', '"'),
        )
        download_rules = (
            ('image' , 'href="', '"'),
            (None    , '<span class="text">', ' '),
            ('width' , '', ' '),
            ('height', ' ', '<'),
        )

        # 1) preview
        info, pos = text.extract_all(page, preview_rules)
        if info["image"].startswith(wanted_prefix):
            return info

        # 2) main image, searched from where the preview lookup stopped
        info, pos = text.extract_all(page, main_rules, pos)
        if info["image"].startswith(wanted_prefix):
            return info

        # 3) download link; without its marker keep the main image result
        marker, pos = text.extract(page, 'dev-page-download', '', pos)
        if marker is None:
            return info

        info, pos = text.extract_all(page, download_rules, pos)
        # the final file URL is read from the Location header of a
        # HEAD request to the download link
        download_url = text.unescape(info["image"])
        head_response = self.session.head(download_url)
        info["image"] = head_response.headers["Location"]
        return info
|
2017-01-12 21:08:49 +01:00
|
|
|
|
|
|
|
|
|
|
|
class DeviantartAPI():
    """Minimal interface for the deviantart API"""

    def __init__(self, extractor, client_id="5388",
                 client_secret="76b08c69cfb27f26d6161f9ab6d061a1"):
        """Bind the API client to an extractor's session and logger.

        Note that the session object is shared with the extractor: the
        'dA-minor-version' header set here (and the 'Authorization'
        header set by authenticate()) end up on that shared session.
        """
        self.session = extractor.session
        self.session.headers["dA-minor-version"] = "20160316"
        self.log = extractor.log
        self.client_id = client_id
        self.client_secret = client_secret
        # seconds to sleep before each API request; adjusted by _call()
        self.delay = 0

    def gallery_all(self, username, offset=0):
        """Yield all Deviation-objects of a specific user

        Results are requested in pages of 10, starting at 'offset'.
        Iteration ends when the API reports no further results or when
        a response without a "results" entry is received (which is
        logged as an error).
        """
        url = "https://www.deviantart.com/api/v1/oauth2/gallery/all"
        params = {"username": username, "offset": offset, "limit": 10}
        while True:
            data = self._call(url, params)
            if "results" in data:
                yield from data["results"]
                if not data["has_more"]:
                    return
                params["offset"] = data["next_offset"]
            else:
                self.log.error("Unexpected API response: %s", data)
                return

    def authenticate(self):
        """Authenticate the application by requesting a bearer token"""
        bearer_token = self._authenticate_impl(
            self.client_id, self.client_secret
        )
        self.session.headers["Authorization"] = bearer_token

    # NOTE(review): 'cache' is a project decorator; presumably this keeps
    # the token for 3600 seconds keyed on the argument at index 1 -
    # confirm against the cache module.
    @cache(maxage=3600, keyarg=1)
    def _authenticate_impl(self, client_id, client_secret):
        """Actual authenticate implementation

        Requests a token using the client-credentials grant and returns
        the value for the 'Authorization' header ("Bearer <token>").
        Raises exception.AuthenticationError on any non-200 response.
        """
        url = "https://www.deviantart.com/oauth2/token"
        data = {
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
        }
        response = self.session.post(url, data=data)
        if response.status_code != 200:
            raise exception.AuthenticationError()
        return "Bearer " + response.json()["access_token"]

    def _call(self, url, params=None):
        """Call an API endpoint

        Returns the decoded JSON body of the first response with status
        200, or an empty dict if that body is not valid JSON.

        A 429 response increases the delay between requests by one
        second and is retried without limit. Any other status sets the
        delay to one second and counts as a failed try; after the third
        failed try an Exception with the response text is raised.
        """
        # Avoid a mutable default argument: a dict in the signature
        # would be one object shared by every call.
        if params is None:
            params = {}

        tries = 1
        while True:
            if self.delay:
                time.sleep(self.delay)

            # make sure the Authorization header is set before each try
            self.authenticate()
            response = self.session.get(url, params=params)

            if response.status_code == 200:
                break
            elif response.status_code == 429:
                self.delay += 1
                self.log.debug("rate limit (delay: %d)", self.delay)
            else:
                self.delay = 1
                self.log.debug("http status code %d (%d/3)",
                               response.status_code, tries)
                tries += 1
                if tries > 3:
                    raise Exception(response.text)
        try:
            return response.json()
        except ValueError:
            return {}
|