1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-25 12:12:34 +01:00
gallery-dl/gallery_dl/extractor/imgur.py

298 lines
9.3 KiB
Python
Raw Normal View History

2015-10-11 16:22:38 +02:00
# -*- coding: utf-8 -*-
# Copyright 2015-2023 Mike Fährmann
2015-10-11 16:22:38 +02:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
2020-02-23 16:48:30 +01:00
"""Extractors for https://imgur.com/"""
2015-10-11 16:22:38 +02:00
from .common import Extractor, Message
2016-10-07 00:13:51 +02:00
from .. import text, exception
2015-10-11 16:22:38 +02:00
2022-12-17 07:27:15 +01:00
# Common URL prefix for every extractor pattern in this module: an optional
# http(s) scheme, an optional "www.", "i." or "m." subdomain, and the
# imgur.com / imgur.io host.
BASE_PATTERN = r"(?:https?://)?(?:www\.|[im]\.)?imgur\.(?:com|io)"
class ImgurExtractor(Extractor):
    """Shared behaviour for all imgur extractors"""
    category = "imgur"
    root = "https://imgur.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # First capture group of the subclass' URL pattern.
        self.key = match.group(1)

    def _init(self):
        """Create the API client and read the 'mp4' option (default True)"""
        self.api = ImgurAPI(self)
        self.mp4 = self.config("mp4", True)

    def _prepare(self, image):
        """Normalize an image dict in place and return its download URL

        The nested 'metadata' dict is merged into 'image' and removed,
        the file extension is adjusted, and 'url', 'date',
        '_http_validate' plus filename fields are added.
        """
        metadata = image["metadata"]
        image.update(metadata)
        del image["metadata"]

        extension = image["ext"]
        if extension == "jpeg":
            image["ext"] = "jpg"
        elif image["is_animated"] and self.mp4 and extension == "gif":
            # Animated GIFs are fetched as MP4 when the option is enabled.
            image["ext"] = "mp4"

        url = "https://i.imgur.com/{}.{}".format(image["id"], image["ext"])
        image["url"] = url
        image["date"] = text.parse_datetime(image["created_at"])
        image["_http_validate"] = self._validate
        text.nameext_from_url(url, image)
        return url

    def _validate(self, response):
        """Return False for a redirected response ending in '/removed.png'"""
        redirected_to_placeholder = (
            response.history and
            response.url.endswith("/removed.png"))
        return not redirected_to_placeholder

    def _items_queue(self, items):
        """Yield a Queue message for each item, tagged with its extractor"""
        album_target = ("https://imgur.com/a/", ImgurAlbumExtractor)
        image_target = ("https://imgur.com/", ImgurImageExtractor)

        for item in items:
            prefix, extractor = (
                album_target if item["is_album"] else image_target)
            url = prefix + item["id"]
            item["_extractor"] = extractor
            yield Message.Queue, url, item
class ImgurImageExtractor(ImgurExtractor):
    """Extractor for individual images on imgur.com"""
    subcategory = "image"
    filename_fmt = "{category}_{id}{title:?_//}.{extension}"
    archive_fmt = "{id}"
    pattern = (BASE_PATTERN + r"/(?!gallery|search)"
               r"(?:r/\w+/)?(?:[^/?#]+-)?(\w{7}|\w{5})[sbtmlh]?")
    example = "https://imgur.com/abcdefg"

    def items(self):
        """Yield one Directory and one Url message for the requested image"""
        image = self.api.image(self.key)

        # Drop the ad fields independently: with two 'del' statements in a
        # single try block, a missing 'ad_url' would leave 'ad_type' behind.
        image.pop("ad_url", None)
        image.pop("ad_type", None)

        # Merge the first 'media' entry into the top-level dict.
        image.update(image["media"][0])
        del image["media"]

        url = self._prepare(image)
        yield Message.Directory, image
        yield Message.Url, url, image
class ImgurAlbumExtractor(ImgurExtractor):
    """Extractor for imgur albums"""
    subcategory = "album"
    directory_fmt = ("{category}", "{album[id]}{album[title]:? - //}")
    filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}"
    archive_fmt = "{album[id]}_{id}"
    pattern = BASE_PATTERN + r"/a/(?:[^/?#]+-)?(\w{7}|\w{5})"
    example = "https://imgur.com/a/abcde"

    def items(self):
        """Yield Directory and Url messages for every image of the album

        Nothing is yielded when the API response has no 'media' key.
        """
        album = self.api.album(self.key)

        try:
            images = album["media"]
        except KeyError:
            return

        # Detach the image list so each image can reference the album
        # metadata without the album also containing every image.
        del album["media"]
        count = len(images)
        album["date"] = text.parse_datetime(album["created_at"])

        # Drop the ad fields independently: with two 'del' statements in a
        # single try block, a missing 'ad_url' would leave 'ad_type' behind.
        album.pop("ad_url", None)
        album.pop("ad_type", None)

        for num, image in enumerate(images, 1):
            url = self._prepare(image)
            image["num"] = num
            image["count"] = count
            image["album"] = album
            yield Message.Directory, image
            yield Message.Url, url, image
class ImgurGalleryExtractor(ImgurExtractor):
    """Extractor for imgur gallery posts"""
    subcategory = "gallery"
    pattern = BASE_PATTERN + r"/(?:gallery|t/\w+)/(?:[^/?#]+-)?(\w{7}|\w{5})"
    example = "https://imgur.com/gallery/abcde"

    def items(self):
        """Queue the post for the album or the image extractor"""
        post = self.api.gallery(self.key)

        if post["is_album"]:
            template = "{}/a/{}"
            target = ImgurAlbumExtractor
        else:
            template = "{}/{}"
            target = ImgurImageExtractor

        url = template.format(self.root, self.key)
        yield Message.Queue, url, {"_extractor": target}
2019-09-17 22:58:18 +02:00
class ImgurUserExtractor(ImgurExtractor):
    """Extractor for everything a user has submitted"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/user/([^/?#]+)(?:/posts|/submitted)?/?$"
    example = "https://imgur.com/user/USER"

    def items(self):
        """Queue each of the account's submissions"""
        submissions = self.api.account_submissions(self.key)
        return self._items_queue(submissions)
2019-09-17 22:58:18 +02:00
class ImgurFavoriteExtractor(ImgurExtractor):
    """Extractor for the favorites of a user"""
    subcategory = "favorite"
    pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/?$"
    example = "https://imgur.com/user/USER/favorites"

    def items(self):
        """Queue each of the account's gallery favorites"""
        favorites = self.api.account_favorites(self.key)
        return self._items_queue(favorites)
class ImgurFavoriteFolderExtractor(ImgurExtractor):
    """Extractor for a single favorites folder of a user"""
    subcategory = "favorite-folder"
    pattern = BASE_PATTERN + r"/user/([^/?#]+)/favorites/folder/(\d+)"
    example = "https://imgur.com/user/USER/favorites/folder/12345/TITLE"

    def __init__(self, match):
        ImgurExtractor.__init__(self, match)
        # Second capture group: the numeric folder ID.
        self.folder_id = match.group(2)

    def items(self):
        """Queue each entry of the selected favorites folder"""
        folder_items = self.api.account_favorites_folder(
            self.key, self.folder_id)
        return self._items_queue(folder_items)
class ImgurSubredditExtractor(ImgurExtractor):
    """Extractor for a subreddit's imgur links"""
    subcategory = "subreddit"
    pattern = BASE_PATTERN + r"/r/([^/?#]+)/?$"
    example = "https://imgur.com/r/SUBREDDIT"

    def items(self):
        """Queue each gallery entry listed for the subreddit"""
        entries = self.api.gallery_subreddit(self.key)
        return self._items_queue(entries)
2020-08-26 22:03:07 +02:00
class ImgurTagExtractor(ImgurExtractor):
    """Extractor for posts listed under an imgur tag"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/t/([^/?#]+)$"
    example = "https://imgur.com/t/TAG"

    def items(self):
        """Queue each gallery entry listed for the tag"""
        entries = self.api.gallery_tag(self.key)
        return self._items_queue(entries)
2020-08-26 22:26:48 +02:00
class ImgurSearchExtractor(ImgurExtractor):
    """Extractor for the results of an imgur search"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search(?:/[^?#]+)?/?\?q=([^&#]+)"
    example = "https://imgur.com/search?q=UERY"

    def items(self):
        """Decode the query string value and queue each search result"""
        # '+' encodes a space in the query string; percent-escapes are
        # resolved afterwards.
        spaced = self.key.replace("+", " ")
        query = text.unquote(spaced)
        results = self.api.gallery_search(query)
        return self._items_queue(results)
class ImgurAPI():
    """Interface for the Imgur API

    Ref: https://apidocs.imgur.com/
    """

    def __init__(self, extractor):
        """Bind to 'extractor' and build the default request headers

        The client ID is taken from the extractor's 'client-id' option;
        a built-in default is used when that option is unset or empty.
        """
        self.extractor = extractor
        self.client_id = extractor.config("client-id") or "546c25a59c58ad7"
        self.headers = {"Authorization": "Client-ID " + self.client_id}

    def account_favorites(self, account):
        """Iterate over the gallery favorites of 'account'"""
        endpoint = "/3/account/{}/gallery_favorites".format(account)
        return self._pagination(endpoint)

    def account_favorites_folder(self, account, folder_id):
        """Iterate over the favorites in folder 'folder_id' of 'account'"""
        endpoint = "/3/account/{}/folders/{}/favorites".format(
            account, folder_id)
        return self._pagination_v2(endpoint)

    def gallery_search(self, query):
        """Iterate over the gallery search results for 'query'"""
        endpoint = "/3/gallery/search"
        params = {"q": query}
        return self._pagination(endpoint, params)

    def account_submissions(self, account):
        """Iterate over the submissions of 'account'"""
        endpoint = "/3/account/{}/submissions".format(account)
        return self._pagination(endpoint)

    def gallery_subreddit(self, subreddit):
        """Iterate over the gallery entries of 'subreddit'"""
        endpoint = "/3/gallery/r/{}".format(subreddit)
        return self._pagination(endpoint)

    def gallery_tag(self, tag):
        """Iterate over the gallery entries of 'tag'"""
        endpoint = "/3/gallery/t/{}".format(tag)
        # Tag responses nest their entries under data["items"].
        return self._pagination(endpoint, key="items")

    def image(self, image_hash):
        """Return the decoded API response for a single image"""
        endpoint = "/post/v1/media/" + image_hash
        params = {"include": "media,tags,account"}
        return self._call(endpoint, params)

    def album(self, album_hash):
        """Return the decoded API response for an album"""
        endpoint = "/post/v1/albums/" + album_hash
        params = {"include": "media,tags,account"}
        return self._call(endpoint, params)

    def gallery(self, gallery_hash):
        """Return the decoded API response for a gallery post"""
        endpoint = "/post/v1/posts/" + gallery_hash
        return self._call(endpoint)

    def _call(self, endpoint, params=None, headers=None):
        """Request 'endpoint' and return the decoded JSON response

        'headers' replaces the default Authorization header when given.
        An HTTP error with status 403 or 429 whose body contains
        "capacity" triggers a 600 second wait followed by a retry;
        any other HTTP error is re-raised.
        """
        while True:
            try:
                return self.extractor.request(
                    "https://api.imgur.com" + endpoint,
                    params=params, headers=(headers or self.headers),
                ).json()
            except exception.HttpError as exc:
                if exc.status not in (403, 429) or \
                        b"capacity" not in exc.response.content:
                    raise
            self.extractor.wait(seconds=600)

    def _pagination(self, endpoint, params=None, key=None):
        """Yield results from '<endpoint>/<page>' until a page is empty

        Page numbers start at 0 and are appended to the endpoint path.
        When 'key' is given, the results are read from data[key].
        """
        num = 0

        while True:
            data = self._call("{}/{}".format(endpoint, num), params)["data"]
            if key:
                data = data[key]
            if not data:
                return
            yield from data
            num += 1

    def _pagination_v2(self, endpoint, params=None, key=None):
        """Yield results using 'page' query-parameter pagination

        The client ID is sent as a query parameter together with an
        Origin header instead of the default Authorization header.
        'key' is accepted for signature parity with _pagination()
        but is currently unused.
        """
        # Work on a copy so a caller-supplied dict is never modified.
        params = dict(params) if params else {}
        params["client_id"] = self.client_id
        params["page"] = 0
        params["sort"] = "newest"

        headers = {"Origin": "https://imgur.com"}

        while True:
            data = self._call(endpoint, params, headers)["data"]
            if not data:
                return
            yield from data
            params["page"] += 1