mirror of https://github.com/mikf/gallery-dl.git (synced 2024-11-22 18:53:21 +01:00)
[reddit] enable recursion (#15)
reddit extractors now recursively visit other submissions/posts linked to
in the initial set of submissions.

This behaviour can be configured via the 'extractor.reddit.recursion' key
in the configuration file or by `-o recursion=<value>`.

Example: {"extractor": { "reddit": { "recursion": <value> }}}

Possible values:
* -1 - infinite recursion (don't do this)
* 0 - recursion is disabled (default)
* 1 and higher - maximum recursion level
parent ae686c4c08
commit 99b72130ee
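As a concrete illustration of the new option (the value 2 and the target URL below are placeholders, not taken from the commit): a configuration file entry of

{"extractor": {"reddit": {"recursion": 2}}}

or the equivalent command line

gallery-dl -o recursion=2 https://www.reddit.com/r/<subreddit>/

makes the reddit extractors follow links found in the initial submissions and their comments up to two levels deep; 0 keeps the previous single-pass behaviour and -1 removes the limit entirely.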
@@ -39,8 +39,8 @@ class MangaparkChapterExtractor(Extractor):
                 r"([^/]+/s(\d+)(?:/v([^/]+))?/c(\d+)(?:([^/]+)|/e(\d+))?)")]
     test = [
         ("http://mangapark.me/manga/gosu/s2/c55", {
-            "url": "482d4a27c1e7f03cff8afac145d06f3ddeac82bb",
-            "keyword": "bd97ca24ef344b44292910384215ef3f1005ea2e",
+            "url": "fefe84492d9118de5962563fbecb9362051c52d5",
+            "keyword": "652b38c40bdfb5592456b6e7524a3acfdef9fae6",
         }),
         (("http://mangapark.me/manga/"
           "ad-astra-per-aspera-hata-kenjirou/s1/c1.2"), {
@@ -9,7 +9,7 @@
 """Extract images from subreddits at https://reddit.com/"""

 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, extractor, exception
 from ..cache import cache
 import re

@@ -21,36 +21,54 @@ class RedditExtractor(Extractor):
     def __init__(self):
         Extractor.__init__(self)
         self.api = RedditAPI(self.session, self.log)
+        self.max_depth = int(self.config("recursion", 0))
+        self._visited = set()

     def items(self):
-        regex = re.compile(r"https?://(?:[^.]+\.)?reddit.com/")
-        yield Message.Version, 1
-        for submission, comments in self.submissions():
-            urls = [submission["url"]]
-            urls.extend(
-                text.extract_iter(
-                    " ".join(self._collect(submission, comments)),
-                    ' href="', '"'
-                )
-            )
-            for url in urls:
-                if url[0] == "#":
-                    continue
-                elif url[0] == "/":
-                    url = "nofollow:https://www.reddit.com" + url
-                elif regex.match(url):
-                    url = "nofollow:" + url
-                yield Message.Queue, url
+        subre = re.compile(RedditSubmissionExtractor.pattern[0])
+        submissions = self.submissions()
+        depth = 0

-    def _collect(self, submission, comments):
-        yield submission["selftext_html"] or ""
-        for comment in comments:
-            yield comment["body_html"] or ""
+        yield Message.Version, 1
+        with extractor.blacklist("reddit"):
+            while True:
+                extra = []
+                for url in self._urls(submissions):
+                    if url[0] == "#":
+                        continue
+                    if url[0] == "/":
+                        url = "https://www.reddit.com" + url
+
+                    match = subre.match(url)
+                    if match:
+                        extra.append(match.group(1))
+                    else:
+                        yield Message.Queue, url
+
+                if not extra or depth == self.max_depth:
+                    return
+                depth += 1
+                submissions = (
+                    self.api.submission(sid) for sid in extra
+                    if sid not in self._visited
+                )

     def submissions(self):
         """Return an iterable containing all (submission, comments) tuples"""

+    def _urls(self, submissions):
+        for submission, comments in submissions:
+            self._visited.add(submission["id"])
+            if not submission["is_self"]:
+                yield submission["url"]
+            strings = [submission["selftext_html"] or ""]
+            strings += [c["body_html"] or "" for c in comments]
+            yield from text.extract_iter("".join(strings), ' href="', '"')
+

 class RedditSubredditExtractor(RedditExtractor):
     """Extractor for images from subreddits on reddit.com"""
-    subcategory = "submission"
+    subcategory = "subreddit"
     pattern = [r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/([^/]+)/?$"]

     def __init__(self, match):
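The rewritten items() above is essentially a depth-limited traversal: each pass collects the IDs of linked submissions into extra, already-visited IDs are skipped, and the loop stops once no new IDs turn up or the configured depth is reached. A minimal standalone sketch of the same pattern (crawl, fetch_links, the "id:" link convention and the sample data below are invented for illustration; the real code works through RedditAPI and yields Message.Queue items instead of returning a list):

def crawl(seeds, fetch_links, max_depth=0):
    """Depth-limited link traversal with a visited set."""
    visited = set()
    batch = list(seeds)
    depth = 0
    results = []
    while True:
        extra = []                          # linked submissions found in this pass
        for sid in batch:
            if sid in visited:
                continue
            visited.add(sid)
            for link in fetch_links(sid):
                if link.startswith("id:"):  # link to another submission
                    extra.append(link[3:])
                else:
                    results.append(link)    # external URL: keep it
        if not extra or depth == max_depth:
            return results
        depth += 1
        batch = extra

links = {"a1": ["https://example.org/img.jpg", "id:b2"],
         "b2": ["https://example.org/pic.png"]}
print(crawl(["a1"], links.get, max_depth=1))   # both URLs
print(crawl(["a1"], links.get, max_depth=0))   # only the first URL (recursion disabled)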
@@ -63,10 +81,11 @@ class RedditSubredditExtractor(RedditExtractor):

 class RedditSubmissionExtractor(RedditExtractor):
     """Extractor for images from a submission on reddit.com"""
-    subcategory = "subreddit"
-    pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
-                r"/comments/([a-z0-9]+)"),
-               (r"(?:https?://)?redd\.it/([a-z0-9]+)")]
+    subcategory = "submission"
+    pattern = [(r"(?:https?://)?(?:"
+                r"(?:m\.|www\.)?reddit\.com/r/[^/]+/comments|"
+                r"redd\.it"
+                r")/([a-z0-9]+)")]

     def __init__(self, match):
         RedditExtractor.__init__(self)
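The new pattern above folds the two previous alternatives into a single regular expression with one capture group for the submission id. A quick check that it accepts both URL forms (the URLs are made-up examples):

import re

pattern = (r"(?:https?://)?(?:"
           r"(?:m\.|www\.)?reddit\.com/r/[^/]+/comments|"
           r"redd\.it"
           r")/([a-z0-9]+)")

for url in ("https://www.reddit.com/r/pics/comments/5abcde/some_title/",
            "https://redd.it/5abcde"):
    print(re.match(pattern, url).group(1))   # prints "5abcde" twice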
@@ -119,10 +138,15 @@ class RedditAPI():

     def _call(self, endpoint, params):
         url = "https://oauth.reddit.com" + endpoint
-        # TODO: handle errors / rate limits
         self.authenticate()
-        response = self.session.get(url, params=params)
-        return response.json()
+        data = self.session.get(url, params=params).json()
+        if "error" in data:
+            if data["error"] == 403:
+                raise exception.AuthorizationError()
+            if data["error"] == 404:
+                raise exception.NotFoundError()
+            raise Exception(data["message"])
+        return data

     def _pagination(self, endpoint, params, _empty=()):
         while True:
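The reworked _call() above inspects the decoded JSON for an "error" field and maps 403 and 404 to gallery-dl's AuthorizationError and NotFoundError, re-raising anything else with the API's message. A small standalone sketch of just that mapping (the stand-in exception classes and sample responses are invented; the real method raises the classes from gallery_dl.exception and gets its data from self.session.get(...).json()):

class AuthorizationError(Exception):
    pass

class NotFoundError(Exception):
    pass

def check(data):
    # mirrors the error handling added to RedditAPI._call
    if "error" in data:
        if data["error"] == 403:
            raise AuthorizationError()
        if data["error"] == 404:
            raise NotFoundError()
        raise Exception(data["message"])
    return data

check({"kind": "Listing", "data": {"children": []}})        # passes through unchanged
# check({"error": 404, "message": "Not Found"})             # would raise NotFoundError
# check({"error": 429, "message": "Too Many Requests"})     # would raise Exception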
@@ -139,7 +163,8 @@ class RedditAPI():
                 return
             params["after"] = data["after"]

-    def _unfold(self, comments):
+    @staticmethod
+    def _unfold(comments):
         # TODO: order?
         queue = comments["data"]["children"]
         while queue:
@@ -21,7 +21,8 @@ class Job():
         self.extractor = extractor.find(url)
         if self.extractor is None:
             raise exception.NoExtractorError(url)
-        self.extractor.log.debug("Using %s", self.extractor.__class__.__name__)
+        self.extractor.log.debug("Using %s for %s",
+                                 self.extractor.__class__.__name__, url)

         items = config.get(("images",))
         if items: