# -*- coding: utf-8 -*-

# Copyright 2017-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.reddit.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache


class RedditExtractor(Extractor):
    """Base class for reddit extractors"""
    category = "reddit"
    directory_fmt = ("{category}", "{subreddit}")
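    # {num:? //>02} appends a space plus a zero-padded index for gallery
    # items (only when num > 0); {title[:220]} truncates long titles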
    filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}"
    archive_fmt = "{filename}"
    cookies_domain = ".reddit.com"
    request_interval = 0.6

    def items(self):
        self.api = RedditAPI(self)
        match_submission = RedditSubmissionExtractor.pattern.match
        match_subreddit = RedditSubredditExtractor.pattern.match
        match_user = RedditUserExtractor.pattern.match

        parentdir = self.config("parent-directory")
        max_depth = self.config("recursion", 0)

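        # 'videos' selects which URL is handed to the ytdl downloader:
        # "ytdl" uses the submission permalink, "dash" the DASH manifest,
        # and any other truthy value the plain video URL
        # (see the _extract_video* methods below)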
        videos = self.config("videos", True)
        if videos:
            if videos == "ytdl":
                self._extract_video = self._extract_video_ytdl
            elif videos == "dash":
                self._extract_video = self._extract_video_dash
            videos = True

        submissions = self.submissions()
        visited = set()
        depth = 0

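        # process all submissions; links to other submissions found in
        # selftexts and comments are collected in 'extra' and followed
        # up to 'recursion' levels deep, skipping already visited IDs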
        while True:
            extra = []

            for submission, comments in submissions:
                urls = []

                if submission:
                    submission["date"] = text.parse_timestamp(
                        submission["created_utc"])
                    yield Message.Directory, submission
                    visited.add(submission["id"])
                    submission["num"] = 0

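                    # for crossposts, read the media information from
                    # the last entry of the parent list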
if "crosspost_parent_list" in submission:
|
|
try:
|
|
media = submission["crosspost_parent_list"][-1]
|
|
except Exception:
|
|
media = submission
|
|
else:
|
|
media = submission
|
|
|
|
url = media["url"]
|
|
if url and url.startswith((
|
|
"https://i.redd.it/",
|
|
"https://preview.redd.it/",
|
|
)):
|
|
text.nameext_from_url(url, submission)
|
|
yield Message.Url, url, submission
|
|
|
|
elif "gallery_data" in media:
|
|
for submission["num"], url in enumerate(
|
|
self._extract_gallery(media), 1):
|
|
text.nameext_from_url(url, submission)
|
|
yield Message.Url, url, submission
|
|
|
|
elif media["is_video"]:
|
|
if videos:
|
|
text.nameext_from_url(url, submission)
|
|
url = "ytdl:" + self._extract_video(media)
|
|
yield Message.Url, url, submission
|
|
|
|
elif not submission["is_self"]:
|
|
urls.append((url, submission))
|
|
|
|
elif parentdir:
|
|
yield Message.Directory, comments[0]
|
|
|
|
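                # collect links from the selftext and comment bodies;
                # links found in a comment carry the submission metadata
                # at the top level and the comment itself under 'comment'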
                if self.api.comments:
                    if submission:
                        for url in text.extract_iter(
                                submission["selftext_html"] or "",
                                ' href="', '"'):
                            urls.append((url, submission))
                    for comment in comments:
                        html = comment["body_html"] or ""
                        if ' href="' in html:
                            comment["date"] = text.parse_timestamp(
                                comment["created_utc"])
                            if submission:
                                data = submission.copy()
                                data["comment"] = comment
                            else:
                                data = comment
                            for url in text.extract_iter(html, ' href="', '"'):
                                urls.append((url, data))

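                # links to other submissions go into 'extra' for
                # recursion, user/subreddit links are ignored, and
                # everything else is passed on as a Queue message with
                # preview images attached as temporary fallback URLs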
                for url, data in urls:
                    if not url or url[0] == "#":
                        continue
                    if url[0] == "/":
                        url = "https://www.reddit.com" + url

                    match = match_submission(url)
                    if match:
                        extra.append(match.group(1))
                    elif not match_user(url) and not match_subreddit(url):
                        if "preview" in data:
                            data["_fallback"] = self._previews(data)
                        yield Message.Queue, text.unescape(url), data
                        if "_fallback" in data:
                            del data["_fallback"]

            if not extra or depth == max_depth:
                return
            depth += 1
            submissions = (
                self.api.submission(sid) for sid in extra
                if sid not in visited
            )

    def submissions(self):
        """Return an iterable containing all (submission, comments) tuples"""

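    # resolve every entry of a gallery post through 'media_metadata'
    # and rewrite preview URLs to their i.redd.it originals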
    def _extract_gallery(self, submission):
        gallery = submission["gallery_data"]
        if gallery is None:
            self.log.warning("gallery %s: deleted", submission["id"])
            return

        meta = submission.get("media_metadata")
        if meta is None:
            self.log.warning("gallery %s: missing 'media_metadata'",
                             submission["id"])
            return

        for item in gallery["items"]:
            data = meta[item["media_id"]]
            if data["status"] != "valid" or "s" not in data:
                self.log.warning(
                    "gallery %s: skipping item %s ('status: %s')",
                    submission["id"], item["media_id"], data.get("status"))
                continue
            src = data["s"]
            url = src.get("u") or src.get("gif") or src.get("mp4")
            if url:
                yield url.partition("?")[0].replace("/preview.", "/i.", 1)
            else:
                self.log.error(
                    "gallery %s: unable to fetch download URL for item %s",
                    submission["id"], item["media_id"])
                self.log.debug(src)

    def _extract_video_ytdl(self, submission):
        return "https://www.reddit.com" + submission["permalink"]

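    # the fragment below is the URL-encoded JSON {"to_generic": 1},
    # youtube-dl's "smuggle" mechanism for passing extra data along
    # with the DASH manifest URL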
    def _extract_video_dash(self, submission):
        submission["_ytdl_extra"] = {"title": submission["title"]}
        try:
            return (submission["secure_media"]["reddit_video"]["dash_url"] +
                    "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D")
        except Exception:
            return submission["url"]

    def _extract_video(self, submission):
        submission["_ytdl_extra"] = {"title": submission["title"]}
        return submission["url"]

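    # fallback URLs for queued links: prefer the DASH/HLS video
    # preview, then the regular preview images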
    def _previews(self, post):
        try:
            if "reddit_video_preview" in post["preview"]:
                video = post["preview"]["reddit_video_preview"]
                if "dash_url" in video:
                    yield "ytdl:" + video["dash_url"]
                if "hls_url" in video:
                    yield "ytdl:" + video["hls_url"]
        except Exception as exc:
            self.log.debug("%s: %s", exc.__class__.__name__, exc)

        try:
            for image in post["preview"]["images"]:
                yield image["source"]["url"]
        except Exception as exc:
            self.log.debug("%s: %s", exc.__class__.__name__, exc)


class RedditSubredditExtractor(RedditExtractor):
    """Extractor for URLs from subreddits on reddit.com"""
    subcategory = "subreddit"
    pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com"
               r"(/r/[^/?#]+(?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")
    example = "https://www.reddit.com/r/SUBREDDIT/"

    def __init__(self, match):
        self.subreddit, sub, params = match.groups()
        self.params = text.parse_query(params)
        if sub:
            self.subcategory += "-" + sub
        RedditExtractor.__init__(self, match)

    def submissions(self):
        return self.api.submissions_subreddit(self.subreddit, self.params)


class RedditHomeExtractor(RedditSubredditExtractor):
    """Extractor for submissions from your home feed on reddit.com"""
    subcategory = "home"
    pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com"
               r"((?:/([a-z]+))?)/?(?:\?([^#]*))?(?:$|#)")
    example = "https://www.reddit.com/"


class RedditUserExtractor(RedditExtractor):
    """Extractor for URLs from posts by a reddit user"""
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/u(?:ser)?/"
               r"([^/?#]+(?:/([a-z]+))?)/?(?:\?([^#]*))?$")
    example = "https://www.reddit.com/user/USER/"

    def __init__(self, match):
        self.user, sub, params = match.groups()
        self.params = text.parse_query(params)
        if sub:
            self.subcategory += "-" + sub
        RedditExtractor.__init__(self, match)

    def submissions(self):
        return self.api.submissions_user(self.user, self.params)


class RedditSubmissionExtractor(RedditExtractor):
    """Extractor for URLs from a submission on reddit.com"""
    subcategory = "submission"
    pattern = (r"(?:https?://)?(?:"
               r"(?:\w+\.)?reddit\.com/(?:(?:r|u|user)/[^/?#]+"
               r"/comments|gallery)|redd\.it)/([a-z0-9]+)")
    example = "https://www.reddit.com/r/SUBREDDIT/comments/id/"

    def __init__(self, match):
        RedditExtractor.__init__(self, match)
        self.submission_id = match.group(1)

    def submissions(self):
        return (self.api.submission(self.submission_id),)


class RedditImageExtractor(Extractor):
    """Extractor for reddit-hosted images"""
    category = "reddit"
    subcategory = "image"
    archive_fmt = "{filename}"
    pattern = (r"(?:https?://)?((?:i|preview)\.redd\.it|i\.reddituploads\.com)"
               r"/([^/?#]+)(\?[^#]*)?")
    example = "https://i.redd.it/NAME.EXT"

    def __init__(self, match):
        Extractor.__init__(self, match)
        domain = match.group(1)
        self.path = match.group(2)
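        # preview.redd.it serves resized copies; fetch the i.redd.it
        # original instead and drop the query string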
        if domain == "preview.redd.it":
            self.domain = "i.redd.it"
            self.query = ""
        else:
            self.domain = domain
            self.query = match.group(3) or ""

    def items(self):
        url = "https://{}/{}{}".format(self.domain, self.path, self.query)
        data = text.nameext_from_url(url)
        yield Message.Directory, data
        yield Message.Url, url, data


class RedditAPI():
    """Interface for the Reddit API

    Ref: https://www.reddit.com/dev/api/
    """
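    # default (public) OAuth credentials, used unless the 'client-id'
    # and 'user-agent' options provide custom ones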
    CLIENT_ID = "6N9uN0krSDE-ig"
    USER_AGENT = "Python:gallery-dl:0.8.4 (by /u/mikf1)"

    def __init__(self, extractor):
        self.extractor = extractor
        self.log = extractor.log

        config = extractor.config
        self.comments = text.parse_int(config("comments", 0))
        self.morecomments = config("morecomments", False)

        client_id = config("client-id")
        if client_id is None:
            self.client_id = self.CLIENT_ID
            self.headers = {"User-Agent": self.USER_AGENT}
        else:
            self.client_id = client_id
            self.headers = {"User-Agent": config("user-agent")}

        if self.client_id == self.CLIENT_ID:
            client_id = self.client_id
            self._warn_429 = True
            kind = "default"
        else:
            client_id = client_id[:5] + "*" * (len(client_id)-5)
            self._warn_429 = False
            kind = "custom"

        self.log.debug(
            "Using %s API credentials (client-id %s)", kind, client_id)

        token = config("refresh-token")
        if token is None or token == "cache":
            key = "#" + self.client_id
            self.refresh_token = _refresh_token_cache(key)
        else:
            self.refresh_token = token

        if not self.refresh_token:
            # allow downloading from quarantined subreddits (#2180)
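            # (the cookie value is the URL-encoded JSON
            # {"pref_quarantine_optin": true})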
            extractor.cookies.set(
                "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D',
                domain=extractor.cookies_domain)

    def submission(self, submission_id):
        """Fetch the (submission, comments)-tuple for a submission id"""
        endpoint = "/comments/" + submission_id + "/.json"
        link_id = "t3_" + submission_id if self.morecomments else None
        submission, comments = self._call(endpoint, {"limit": self.comments})
        return (submission["data"]["children"][0]["data"],
                self._flatten(comments, link_id) if self.comments else ())

    def submissions_subreddit(self, subreddit, params):
        """Collect all (submission, comments)-tuples of a subreddit"""
        endpoint = subreddit + "/.json"
        params["limit"] = 100
        return self._pagination(endpoint, params)

    def submissions_user(self, user, params):
        """Collect all (submission, comments)-tuples posted by a user"""
        endpoint = "/user/" + user + "/.json"
        params["limit"] = 100
        return self._pagination(endpoint, params)

    def morechildren(self, link_id, children):
        """Load additional comments from a submission"""
        endpoint = "/api/morechildren"
        params = {"link_id": link_id, "api_type": "json"}
        index, done = 0, False
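        # request comments in batches of up to 100 IDs; any 'more'
        # stubs returned are appended to the list and processed as well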
        while not done:
            if len(children) - index < 100:
                done = True
            params["children"] = ",".join(children[index:index + 100])
            index += 100

            data = self._call(endpoint, params)["json"]
            for thing in data["data"]["things"]:
                if thing["kind"] == "more":
                    children.extend(thing["data"]["children"])
                else:
                    yield thing["data"]

    def authenticate(self):
        """Authenticate the application by requesting an access token"""
        self.headers["Authorization"] = \
            self._authenticate_impl(self.refresh_token)

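    # token responses are cached for an hour, matching the lifetime
    # of reddit access tokens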
    @cache(maxage=3600, keyarg=1)
    def _authenticate_impl(self, refresh_token=None):
        """Actual authenticate implementation"""
        url = "https://www.reddit.com/api/v1/access_token"
        self.headers["Authorization"] = None

        if refresh_token:
            self.log.info("Refreshing private access token")
            data = {"grant_type": "refresh_token",
                    "refresh_token": refresh_token}
        else:
            self.log.info("Requesting public access token")
            data = {"grant_type": ("https://oauth.reddit.com/"
                                   "grants/installed_client"),
                    "device_id": "DO_NOT_TRACK_THIS_DEVICE"}

        response = self.extractor.request(
            url, method="POST", headers=self.headers,
            data=data, auth=(self.client_id, ""), fatal=False)
        data = response.json()

        if response.status_code != 200:
            self.log.debug("Server response: %s", data)
            raise exception.AuthenticationError('"{}: {}"'.format(
                data.get("error"), data.get("message")))
        return "Bearer " + data["access_token"]

    def _call(self, endpoint, params):
        url = "https://oauth.reddit.com" + endpoint
        params["raw_json"] = "1"

        while True:
            self.authenticate()
            response = self.extractor.request(
                url, params=params, headers=self.headers, fatal=None)

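            # rate limiting: when fewer than 2 requests remain in the
            # current window, wait until 'x-ratelimit-reset' and retry;
            # users of the default credentials get a one-time hint to
            # register their own OAuth application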
            remaining = response.headers.get("x-ratelimit-remaining")
            if remaining and float(remaining) < 2:
                if self._warn_429:
                    self._warn_429 = False
                    self.log.info(
                        "Register your own OAuth application and use its "
                        "credentials to prevent this error: "
                        "https://github.com/mikf/gallery-dl/blob/master"
                        "/docs/configuration.rst"
                        "#extractorredditclient-id--user-agent")
                self.extractor.wait(
                    seconds=response.headers["x-ratelimit-reset"])
                continue

            try:
                data = response.json()
            except ValueError:
                raise exception.StopExtraction(text.remove_html(response.text))

            if "error" in data:
                if data["error"] == 403:
                    raise exception.AuthorizationError()
                if data["error"] == 404:
                    raise exception.NotFoundError()
                self.log.debug(data)
                raise exception.StopExtraction(data.get("message"))
            return data

    def _pagination(self, endpoint, params):
        id_min = self._parse_id("id-min", 0)
        id_max = self._parse_id("id-max", float("inf"))
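        # "zik0zj" is the base-36 form of 2147483647 (2**31 - 1);
        # treat it as "no upper bound"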
        if id_max == 2147483647:
            self.log.debug("Ignoring 'id-max' setting \"zik0zj\"")
            id_max = float("inf")
        date_min, date_max = self.extractor._get_date_min_max(0, 253402210800)

        while True:
            data = self._call(endpoint, params)["data"]

            for child in data["children"]:
                kind = child["kind"]
                post = child["data"]

                if (date_min <= post["created_utc"] <= date_max and
                        id_min <= self._decode(post["id"]) <= id_max):

                    if kind == "t3":
                        if post["num_comments"] and self.comments:
                            try:
                                yield self.submission(post["id"])
                            except exception.AuthorizationError:
                                pass
                        else:
                            yield post, ()

                    elif kind == "t1" and self.comments:
                        yield None, (post,)

            if not data["after"]:
                return
            params["after"] = data["after"]

    def _flatten(self, comments, link_id=None):
        extra = []
        queue = comments["data"]["children"]
        while queue:
            comment = queue.pop(0)
            if comment["kind"] == "more":
                if link_id:
                    extra.extend(comment["data"]["children"])
                continue
            comment = comment["data"]
            yield comment
            if comment["replies"]:
                queue += comment["replies"]["data"]["children"]
        if link_id and extra:
            yield from self.morechildren(link_id, extra)

    def _parse_id(self, key, default):
        sid = self.extractor.config(key)
        return self._decode(sid.rpartition("_")[2].lower()) if sid else default

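    # reddit IDs are base-36 strings; decode them to integers so they
    # can be compared against 'id-min' and 'id-max'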
    @staticmethod
    def _decode(sid):
        return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")


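# refresh tokens are cached practically forever (100 years); a token
# starting with "#" is an internal cache key derived from the client-id
# (see RedditAPI.__init__) and maps to None until a real token is cached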
@cache(maxage=100*365*24*3600, keyarg=0)
def _refresh_token_cache(token):
    if token and token[0] == "#":
        return None
    return token