2015-10-11 16:22:38 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-04-21 14:22:26 +02:00
|
|
|
# Copyright 2015-2023 Mike Fährmann
|
2015-10-11 16:22:38 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2020-02-23 16:48:30 +01:00
|
|
|
"""Extractors for https://imgur.com/"""
|
2015-10-11 16:22:38 +02:00
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2016-10-07 00:13:51 +02:00
|
|
|
from .. import text, exception
|
2015-10-11 16:22:38 +02:00
|
|
|
|
2022-12-17 07:27:15 +01:00
|
|
|
# URL prefix shared by the extractor patterns in this module:
# optional "http://" or "https://" scheme, optional "www.", "i." or "m."
# subdomain, followed by "imgur.com" or "imgur.io"
BASE_PATTERN = r"(?:https?://)?(?:www\.|[im]\.)?imgur\.(?:com|io)"
|
2019-09-19 15:54:26 +02:00
|
|
|
|
|
|
|
|
2017-05-26 22:30:09 +02:00
|
|
|
class ImgurExtractor(Extractor):
    """Common base for all imgur extractors"""
    category = "imgur"
    root = "https://imgur.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first capture group of the subclass pattern
        self.key = match.group(1)

    def _init(self):
        self.api = ImgurAPI(self)
        self.mp4 = self.config("mp4", True)

    def _prepare(self, image):
        """Flatten and normalize 'image' in place; return its file URL"""
        # merge the nested 'metadata' object into the top level
        metadata = image["metadata"]
        image.update(metadata)
        del image["metadata"]

        # normalize the extension: 'jpeg' -> 'jpg', and animated GIFs
        # become 'mp4' when the 'mp4' option is enabled
        ext = image["ext"]
        if ext == "jpeg":
            ext = "jpg"
        elif image["is_animated"] and self.mp4 and ext == "gif":
            ext = "mp4"
        image["ext"] = ext

        url = "https://i.imgur.com/{}.{}".format(image["id"], ext)
        image["url"] = url
        image["date"] = text.parse_datetime(image["created_at"])
        text.nameext_from_url(url, image)

        return url

    def _items_queue(self, items):
        """Yield a Queue message for each entry in 'items'

        Entries flagged as albums are delegated to ImgurAlbumExtractor,
        all others to ImgurImageExtractor.
        """
        for item in items:
            if item["is_album"]:
                prefix = "https://imgur.com/a/"
                extr = ImgurAlbumExtractor
            else:
                prefix = "https://imgur.com/"
                extr = ImgurImageExtractor

            url = prefix + item["id"]
            item["_extractor"] = extr
            yield Message.Queue, url, item
|
2019-09-19 15:54:26 +02:00
|
|
|
|
2017-05-26 22:30:09 +02:00
|
|
|
|
|
|
|
class ImgurImageExtractor(ImgurExtractor):
    """Extractor for individual images on imgur.com"""
    subcategory = "image"
    filename_fmt = "{category}_{id}{title:?_//}.{extension}"
    archive_fmt = "{id}"
    pattern = (BASE_PATTERN + r"/(?!gallery|search)"
               r"(?:r/\w+/)?(\w{7}|\w{5})[sbtmlh]?")
    example = "https://imgur.com/abcdefg"

    def items(self):
        """Yield the Directory and Url messages for a single image"""
        image = self.api.image(self.key)

        # Remove the advertisement fields independently of each other,
        # so that a missing 'ad_url' does not leave 'ad_type' behind.
        for ad_key in ("ad_url", "ad_type"):
            image.pop(ad_key, None)

        # merge the first (only) media entry into the post metadata
        image.update(image["media"][0])
        del image["media"]

        url = self._prepare(image)
        yield Message.Directory, image
        yield Message.Url, url, image
|
|
|
|
|
|
|
|
|
|
|
|
class ImgurAlbumExtractor(ImgurExtractor):
    """Extractor for imgur albums"""
    subcategory = "album"
    directory_fmt = ("{category}", "{album[id]}{album[title]:? - //}")
    filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}"
    archive_fmt = "{album[id]}_{id}"
    pattern = BASE_PATTERN + r"/a/(\w{7}|\w{5})"
    example = "https://imgur.com/a/abcde"

    def items(self):
        """Yield Directory and Url messages for every image of an album"""
        album = self.api.album(self.key)

        # an album without a 'media' list has nothing to download
        try:
            images = album["media"]
        except KeyError:
            return

        del album["media"]
        count = len(images)
        album["date"] = text.parse_datetime(album["created_at"])

        # Remove the advertisement fields independently of each other,
        # so that a missing 'ad_url' does not leave 'ad_type' behind.
        for ad_key in ("ad_url", "ad_type"):
            album.pop(ad_key, None)

        for num, image in enumerate(images, 1):
            url = self._prepare(image)
            image["num"] = num
            image["count"] = count
            image["album"] = album
            yield Message.Directory, image
            yield Message.Url, url, image
|
2019-08-14 21:20:58 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ImgurGalleryExtractor(ImgurExtractor):
    """Extractor for imgur gallery posts"""
    subcategory = "gallery"
    pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(\w{7}|\w{5})"
    example = "https://imgur.com/gallery/abcde"

    def items(self):
        """Delegate the post to the album or the image extractor"""
        post = self.api.gallery(self.key)

        if post["is_album"]:
            url = "{}/a/{}".format(self.root, self.key)
            target = ImgurAlbumExtractor
        else:
            url = "{}/{}".format(self.root, self.key)
            target = ImgurImageExtractor

        yield Message.Queue, url, {"_extractor": target}
|
2019-09-17 22:58:18 +02:00
|
|
|
|
|
|
|
|
|
|
|
class ImgurUserExtractor(ImgurExtractor):
    """Extractor for the submissions of an imgur user"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/user/([^/?#]+)(?:/posts|/submitted)?/?$"
    example = "https://imgur.com/user/USER"

    def items(self):
        """Queue every submission of the account named in the URL"""
        submissions = self.api.account_submissions(self.key)
        return self._items_queue(submissions)
|
2019-09-17 22:58:18 +02:00
|
|
|
|
|
|
|
|
2019-09-19 15:54:26 +02:00
|
|
|
class ImgurFavoriteExtractor(ImgurExtractor):
    """Extractor for the favorites of an imgur user"""
    subcategory = "favorite"
    pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/?$"
    example = "https://imgur.com/user/USER/favorites"

    def items(self):
        """Queue every favorite of the account named in the URL"""
        favorites = self.api.account_favorites(self.key)
        return self._items_queue(favorites)
|
|
|
|
|
|
|
|
|
2023-05-06 14:52:43 +02:00
|
|
|
class ImgurFavoriteFolderExtractor(ImgurExtractor):
    """Extractor for a single favorites folder of an imgur user"""
    subcategory = "favorite-folder"
    pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/folder/(\d+)"
    example = "https://imgur.com/user/USER/favorites/folder/12345/TITLE"

    def __init__(self, match):
        ImgurExtractor.__init__(self, match)
        # second capture group: numeric folder ID
        self.folder_id = match.group(2)

    def items(self):
        """Queue every entry of the selected favorites folder"""
        entries = self.api.account_favorites_folder(
            self.key, self.folder_id)
        return self._items_queue(entries)
|
|
|
|
|
|
|
|
|
2019-12-02 22:34:34 +01:00
|
|
|
class ImgurSubredditExtractor(ImgurExtractor):
    """Extractor for the imgur links of a subreddit"""
    subcategory = "subreddit"
    pattern = BASE_PATTERN + r"/r/([^/?#]+)/?$"
    example = "https://imgur.com/r/SUBREDDIT"

    def items(self):
        """Queue every gallery entry listed for the subreddit"""
        posts = self.api.gallery_subreddit(self.key)
        return self._items_queue(posts)
|
|
|
|
|
|
|
|
|
2020-08-26 22:03:07 +02:00
|
|
|
class ImgurTagExtractor(ImgurExtractor):
    """Extractor for the results of an imgur tag search"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/t/([^/?#]+)$"
    example = "https://imgur.com/t/TAG"

    def items(self):
        """Queue every gallery entry listed for the tag"""
        posts = self.api.gallery_tag(self.key)
        return self._items_queue(posts)
|
|
|
|
|
|
|
|
|
2020-08-26 22:26:48 +02:00
|
|
|
class ImgurSearchExtractor(ImgurExtractor):
    """Extractor for imgur search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search(?:/[^?#]+)?/?\?q=([^&#]+)"
    example = "https://imgur.com/search?q=QUERY"

    def items(self):
        """Queue every result of the search query taken from the URL"""
        # the query is form-encoded: '+' stands for a space and the
        # remaining characters are percent-encoded
        query = text.unquote(self.key.replace("+", " "))
        return self._items_queue(self.api.gallery_search(query))
|
|
|
|
|
|
|
|
|
2019-10-22 23:51:41 +02:00
|
|
|
class ImgurAPI():
    """Interface for the Imgur API

    Ref: https://apidocs.imgur.com/
    """
    def __init__(self, extractor):
        self.extractor = extractor
        # fall back to the built-in client ID if none is configured
        self.client_id = extractor.config("client-id") or "546c25a59c58ad7"
        self.headers = {"Authorization": "Client-ID " + self.client_id}

    def account_favorites(self, account):
        """Iterate over the gallery favorites of 'account'"""
        endpoint = "/3/account/{}/gallery_favorites".format(account)
        return self._pagination(endpoint)

    def account_favorites_folder(self, account, folder_id):
        """Iterate over the favorites in folder 'folder_id' of 'account'"""
        endpoint = "/3/account/{}/folders/{}/favorites".format(
            account, folder_id)
        return self._pagination_v2(endpoint)

    def gallery_search(self, query):
        """Iterate over the gallery search results for 'query'"""
        endpoint = "/3/gallery/search"
        params = {"q": query}
        return self._pagination(endpoint, params)

    def account_submissions(self, account):
        """Iterate over the submissions of 'account'"""
        endpoint = "/3/account/{}/submissions".format(account)
        return self._pagination(endpoint)

    def gallery_subreddit(self, subreddit):
        """Iterate over the gallery entries of 'subreddit'"""
        endpoint = "/3/gallery/r/{}".format(subreddit)
        return self._pagination(endpoint)

    def gallery_tag(self, tag):
        """Iterate over the gallery entries listed under 'tag'"""
        endpoint = "/3/gallery/t/{}".format(tag)
        # results are nested under the 'items' key of each page
        return self._pagination(endpoint, key="items")

    def image(self, image_hash):
        """Return the decoded API response for a single image"""
        endpoint = "/post/v1/media/" + image_hash
        params = {"include": "media,tags,account"}
        return self._call(endpoint, params)

    def album(self, album_hash):
        """Return the decoded API response for an album"""
        endpoint = "/post/v1/albums/" + album_hash
        params = {"include": "media,tags,account"}
        return self._call(endpoint, params)

    def gallery(self, gallery_hash):
        """Return the decoded API response for a gallery post"""
        endpoint = "/post/v1/posts/" + gallery_hash
        return self._call(endpoint)

    def _call(self, endpoint, params=None, headers=None):
        """Request 'endpoint' and return the decoded JSON response

        The default 'Authorization' headers are sent unless 'headers'
        is given. A 403 or 429 error whose response body contains
        "capacity" causes a 600 second wait followed by a retry;
        any other HTTP error is re-raised.
        """
        while True:
            try:
                return self.extractor.request(
                    "https://api.imgur.com" + endpoint,
                    params=params, headers=(headers or self.headers),
                ).json()
            except exception.HttpError as exc:
                if exc.status not in (403, 429) or \
                        b"capacity" not in exc.response.content:
                    raise
            self.extractor.wait(seconds=600)

    def _pagination(self, endpoint, params=None, key=None):
        """Yield results from '<endpoint>/<page>' until a page is empty

        If 'key' is given, the results of each page are taken from
        that key of the page's 'data' object.
        """
        num = 0

        while True:
            data = self._call("{}/{}".format(endpoint, num), params)["data"]
            if key:
                data = data[key]
            if not data:
                return
            yield from data
            num += 1

    def _pagination_v2(self, endpoint, params=None, key=None):
        """Yield results from 'endpoint' using a 'page' query parameter

        The client ID is sent as a query parameter together with an
        'Origin' header instead of the default 'Authorization' header.
        If 'key' is given, the results of each page are taken from
        that key of the page's 'data' object.
        """
        # work on a copy so that the caller's dict is never modified
        params = dict(params) if params else {}
        params["client_id"] = self.client_id
        params["page"] = 0
        params["sort"] = "newest"
        headers = {"Origin": "https://imgur.com"}

        while True:
            data = self._call(endpoint, params, headers)["data"]
            if key:
                data = data[key]
            if not data:
                return
            yield from data

            params["page"] += 1
|