
[reddit] enable recursion (#15)

reddit extractors now recursively visit other submissions/posts
linked to in the initial set of submissions.
This behaviour can be configured via the 'extractor.reddit.recursion'
key in the configuration file or on the command line via
`-o recursion=<value>`.

Example:
{"extractor": {
  "reddit": {
   "recursion": <value>
}}}

Possible values:
* -1 - infinite recursion (don't do this)
*  0 - recursion is disabled (default)
*  1 and higher - maximum recursion level
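
As an illustration of the traversal this enables, here is a minimal,
self-contained sketch (toy data and hypothetical names, not gallery-dl's
actual code) of depth-bounded recursion: each round collects links to
further submissions and stops once the configured depth is reached;
since -1 never equals the current depth, it recurses indefinitely.

def crawl(start_ids, links, max_depth=0):
    """Yield submission IDs reachable from 'start_ids' within 'max_depth'."""
    queue, visited, depth = list(start_ids), set(), 0
    while True:
        extra = []                          # submissions linked this round
        for sid in queue:
            visited.add(sid)
            yield sid
            extra += [s for s in links.get(sid, ()) if s not in visited]
        if not extra or depth == max_depth:
            return
        depth += 1
        queue = extra

# toy link graph: submission 'a' links to 'b', which links to 'c'
links = {"a": ["b"], "b": ["c"]}
print(list(crawl(["a"], links, max_depth=0)))   # ['a']
print(list(crawl(["a"], links, max_depth=1)))   # ['a', 'b']
print(list(crawl(["a"], links, max_depth=-1)))  # ['a', 'b', 'c']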
Mike Fährmann 2017-05-26 16:40:08 +02:00
parent ae686c4c08
commit 99b72130ee
3 changed files with 61 additions and 35 deletions

gallery_dl/extractor/mangapark.py

@@ -39,8 +39,8 @@ class MangaparkChapterExtractor(Extractor):
                 r"([^/]+/s(\d+)(?:/v([^/]+))?/c(\d+)(?:([^/]+)|/e(\d+))?)")]
     test = [
         ("http://mangapark.me/manga/gosu/s2/c55", {
-            "url": "482d4a27c1e7f03cff8afac145d06f3ddeac82bb",
-            "keyword": "bd97ca24ef344b44292910384215ef3f1005ea2e",
+            "url": "fefe84492d9118de5962563fbecb9362051c52d5",
+            "keyword": "652b38c40bdfb5592456b6e7524a3acfdef9fae6",
         }),
         (("http://mangapark.me/manga/"
           "ad-astra-per-aspera-hata-kenjirou/s1/c1.2"), {

gallery_dl/extractor/reddit.py

@@ -9,7 +9,7 @@
 """Extract images from subreddits at https://reddit.com/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, extractor, exception
 from ..cache import cache
 import re
@@ -21,36 +21,54 @@ class RedditExtractor(Extractor):
     def __init__(self):
         Extractor.__init__(self)
         self.api = RedditAPI(self.session, self.log)
+        self.max_depth = int(self.config("recursion", 0))
+        self._visited = set()
 
     def items(self):
-        regex = re.compile(r"https?://(?:[^.]+\.)?reddit.com/")
-        yield Message.Version, 1
-        for submission, comments in self.submissions():
-            urls = [submission["url"]]
-            urls.extend(
-                text.extract_iter(
-                    " ".join(self._collect(submission, comments)),
-                    ' href="', '"'
-                )
-            )
-            for url in urls:
-                if url[0] == "#":
-                    continue
-                elif url[0] == "/":
-                    url = "nofollow:https://www.reddit.com" + url
-                elif regex.match(url):
-                    url = "nofollow:" + url
-                yield Message.Queue, url
+        subre = re.compile(RedditSubmissionExtractor.pattern[0])
+        submissions = self.submissions()
+        depth = 0
 
-    def _collect(self, submission, comments):
-        yield submission["selftext_html"] or ""
-        for comment in comments:
-            yield comment["body_html"] or ""
+        yield Message.Version, 1
+        with extractor.blacklist("reddit"):
+            while True:
+                extra = []
+                for url in self._urls(submissions):
+                    if url[0] == "#":
+                        continue
+                    if url[0] == "/":
+                        url = "https://www.reddit.com" + url
+
+                    match = subre.match(url)
+                    if match:
+                        extra.append(match.group(1))
+                    else:
+                        yield Message.Queue, url
+
+                if not extra or depth == self.max_depth:
+                    return
+                depth += 1
+                submissions = (
+                    self.api.submission(sid) for sid in extra
+                    if sid not in self._visited
+                )
 
     def submissions(self):
         """Return an iterable containing all (submission, comments) tuples"""
 
+    def _urls(self, submissions):
+        for submission, comments in submissions:
+            self._visited.add(submission["id"])
+
+            if not submission["is_self"]:
+                yield submission["url"]
+
+            strings = [submission["selftext_html"] or ""]
+            strings += [c["body_html"] or "" for c in comments]
+            yield from text.extract_iter("".join(strings), ' href="', '"')
+
 
 class RedditSubredditExtractor(RedditExtractor):
     """Extractor for images from subreddits on reddit.com"""
-    subcategory = "submission"
+    subcategory = "subreddit"
     pattern = [r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/([^/]+)/?$"]
 
     def __init__(self, match):
@@ -63,10 +81,11 @@ class RedditSubredditExtractor(RedditExtractor):
 
 class RedditSubmissionExtractor(RedditExtractor):
     """Extractor for images from a submission on reddit.com"""
-    subcategory = "subreddit"
-    pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
-                r"/comments/([a-z0-9]+)"),
-               (r"(?:https?://)?redd\.it/([a-z0-9]+)")]
+    subcategory = "submission"
+    pattern = [(r"(?:https?://)?(?:"
+                r"(?:m\.|www\.)?reddit\.com/r/[^/]+/comments|"
+                r"redd\.it"
+                r")/([a-z0-9]+)")]
 
     def __init__(self, match):
         RedditExtractor.__init__(self)
@@ -119,10 +138,15 @@ class RedditAPI():
 
     def _call(self, endpoint, params):
         url = "https://oauth.reddit.com" + endpoint
-        # TODO: handle errors / rate limits
         self.authenticate()
-        response = self.session.get(url, params=params)
-        return response.json()
+        data = self.session.get(url, params=params).json()
+        if "error" in data:
+            if data["error"] == 403:
+                raise exception.AuthorizationError()
+            if data["error"] == 404:
+                raise exception.NotFoundError()
+            raise Exception(data["message"])
+        return data
 
     def _pagination(self, endpoint, params, _empty=()):
         while True:
@@ -139,7 +163,8 @@ class RedditAPI():
             return
         params["after"] = data["after"]
 
-    def _unfold(self, comments):
+    @staticmethod
+    def _unfold(comments):
         # TODO: order?
         queue = comments["data"]["children"]
         while queue:
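
The `with extractor.blacklist("reddit")` block in the items() hunk above
keeps reddit URLs found inside submissions from being dispatched to a
fresh reddit extractor job; the recursion loop handles them itself. The
helper's implementation is not part of this diff; a hypothetical sketch
of the idea (assumed names, not gallery-dl's actual code):

from contextlib import contextmanager

_blacklisted = set()

@contextmanager
def blacklist(*categories):
    # temporarily mark extractor categories as off-limits for URL dispatch
    _blacklisted.update(categories)
    try:
        yield
    finally:
        _blacklisted.difference_update(categories)

def may_dispatch(category):
    # dispatch code would consult this before handing a URL to an extractor
    return category not in _blacklisted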

gallery_dl/job.py

@@ -21,7 +21,8 @@ class Job():
         self.extractor = extractor.find(url)
         if self.extractor is None:
             raise exception.NoExtractorError(url)
-        self.extractor.log.debug("Using %s", self.extractor.__class__.__name__)
+        self.extractor.log.debug("Using %s for %s",
+                                 self.extractor.__class__.__name__, url)
 
         items = config.get(("images",))
         if items: