[newgrounds] rewrite (#394)

- restructure extractor hierarchy
- extract more metadata
- extract videos without youtube-dl
- be more resilient to errors

TODO:
- favorites
- games, but that might be near impossible for non-flash titles
2024-11-26 12:42:29 +01:00 · 2019-11-16 23:51:02 +01:00 · 2019-11-16 23:51:02 +01:00 · b1f0609de5
commit b1f0609de5
parent 3ece3976ae
1 changed files with 239 additions and 78 deletions
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@ -17,7 +17,7 @@ import json
 class NewgroundsExtractor(Extractor):
    """Base class for newgrounds extractors"""
    category = "newgrounds"
-    directory_fmt = ("{category}", "{user}")
+    directory_fmt = ("{category}", "{artist[:10]:J, }")
    filename_fmt = "{category}_{index}_{title}.{extension}"
    archive_fmt = "{index}"
    root = "https://www.newgrounds.com"
@ -31,45 +31,24 @@ class NewgroundsExtractor(Extractor):

    def items(self):
        self.login()
-        data = self.metadata()
        yield Message.Version, 1

        for post_url in self.posts():
-            file = self.extract_post_data(post_url)
-            file.update(data)
-            url = file["url"]
+            try:
+                file = self.extract_post(post_url)
+                url = file["url"]
+            #  except Exception:
+            except OSError:
+                url = None
+            if not url:
+                self.log.warning("Unable to get download URL for %s", post_url)
+                continue
            yield Message.Directory, file
            yield Message.Url, url, text.nameext_from_url(url, file)

-    def metadata(self):
-        """Collect metadata for extractor-job"""
-        return {"user": self.user}
-
    def posts(self):
        """Return urls of all relevant image pages"""
-
-    def extract_post_data(self, post_url):
-        """Collect url and metadata from an image post"""
-        extr = text.extract_from(self.request(post_url).text)
-        full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
-        data = {
-            "title"      : text.unescape(extr('"og:title" content="', '"')),
-            "description": text.unescape(extr(':description" content="', '"')),
-            "date"       : text.parse_datetime(extr(
-                'itemprop="datePublished" content="', '"')),
-            "rating"     : extr('class="rated-', '"'),
-            "favorites"  : text.parse_int(extr('id="faves_load">', '<')),
-            "score"      : text.parse_float(extr('id="score_number">', '<')),
-            "tags"       : text.split_html(extr(
-                '<dd class="tags momag">', '</dd>')),
-            "url"        : full('src="', '"'),
-            "width"      : text.parse_int(full('width="', '"')),
-            "height"     : text.parse_int(full('height="', '"')),
-        }
-        data["tags"].sort()
-        data["index"] = text.parse_int(
-            data["url"].rpartition("/")[2].partition("_")[0])
-        return data
+        return self._pagination(self.subcategory)

    def login(self):
        username, password = self._get_auth_info()
@ -102,40 +81,104 @@ class NewgroundsExtractor(Extractor):
            if cookie.expires and cookie.domain == self.cookiedomain
        }

-    def _pagination(self, url):
-        headers = {
-            "Referer": self.root,
-            "X-Requested-With": "XMLHttpRequest",
-            "Accept": "application/json, text/javascript, */*; q=0.01",
+    def extract_post(self, post_url):
+        page = self.request(post_url).text
+        extr = text.extract_from(page)
+
+        if "/art/view/" in post_url:
+            data = self._extract_image_data(extr, post_url)
+        elif "/audio/listen/" in post_url:
+            data = self._extract_audio_data(extr, post_url)
+        else:
+            data = self._extract_media_data(extr, post_url)
+
+        data["comment"] = text.unescape(text.remove_html(extr(
+            'id="author_comments">', '</div>'), "", ""))
+        data["favorites"] = text.parse_int(extr(
+            'id="faves_load">', '<').replace(",", ""))
+        data["score"] = text.parse_float(extr('id="score_number">', '<'))
+        data["tags"] = text.split_html(extr(
+            '<dd class="tags momag">', '</dd>'))
+        data["artist"] = [
+            text.extract(user, '//', '.')[0]
+            for user in text.extract_iter(page, '<div class="item-user">', '>')
+        ]
+
+        data["tags"].sort()
+        data["user"] = self.user or data["artist"][0]
+        return data
+
+    @staticmethod
+    def _extract_image_data(extr, url):
+        full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
+        data = {
+            "title"      : text.unescape(extr('"og:title" content="', '"')),
+            "description": text.unescape(extr(':description" content="', '"')),
+            "date"       : text.parse_datetime(extr(
+                'itemprop="datePublished" content="', '"')),
+            "rating"     : extr('class="rated-', '"'),
+            "url"        : full('src="', '"'),
+            "width"      : text.parse_int(full('width="', '"')),
+            "height"     : text.parse_int(full('height="', '"')),
+        }
+        data["index"] = text.parse_int(
+            data["url"].rpartition("/")[2].partition("_")[0])
+        return data
+
+    @staticmethod
+    def _extract_audio_data(extr, url):
+        return {
+            "title"      : text.unescape(extr('"og:title" content="', '"')),
+            "description": text.unescape(extr(':description" content="', '"')),
+            "date"       : text.parse_datetime(extr(
+                'itemprop="datePublished" content="', '"')),
+            "url"        : extr('{"url":"', '"').replace("\\/", "/"),
+            "index"      : text.parse_int(url.split("/")[5]),
+            "rating"     : "",
        }

+    @staticmethod
+    def _extract_media_data(extr, url):
+        return {
+            "title"      : text.unescape(extr('"og:title" content="', '"')),
+            "url"        : extr('{"url":"', '"').replace("\\/", "/"),
+            "date"       : text.parse_datetime(extr(
+                'itemprop="datePublished" content="', '"')),
+            "description": text.unescape(extr(
+                'itemprop="description" content="', '"')),
+            "rating"     : extr('class="rated-', '"'),
+            "index"      : text.parse_int(url.split("/")[5]),
+        }
+
+    def _pagination(self, kind):
+        root = self.user_root
+        headers = {
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "X-Requested-With": "XMLHttpRequest",
+            "Referer": root,
+        }
+        url = "{}/{}/page/1".format(root, kind)
+
        while True:
-            data = self.request(url, headers=headers).json()
+            with self.request(url, headers=headers, fatal=False) as response:
+                try:
+                    data = response.json()
+                except ValueError:
+                    return
+                if not data:
+                    return
+                if "errors" in data:
+                    msg = ", ".join(text.unescape(e) for e in data["errors"])
+                    raise exception.StopExtraction(msg)

            for year in data["sequence"]:
                for item in data["years"][str(year)]["items"]:
                    page_url = text.extract(item, 'href="', '"')[0]
-                    yield text.urljoin(self.root, page_url)
+                    yield text.urljoin(root, page_url)

            if not data["more"]:
                return
-            url = text.urljoin(self.root, data["more"])
-
-
-class NewgroundsUserExtractor(NewgroundsExtractor):
-    """Extractor for all images of a newgrounds user"""
-    subcategory = "user"
-    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com(?:/art)?/?$"
-    test = (
-        ("https://blitzwuff.newgrounds.com/art", {
-            "url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
-            "keyword": "8330e1e563c8a5464938feaceac88aa9870aefe4",
-        }),
-        ("https://blitzwuff.newgrounds.com/"),
-    )
-
-    def posts(self):
-        return self._pagination(self.root + "/art/page/1")
+            url = text.urljoin(root, data["more"])


 class NewgroundsImageExtractor(NewgroundsExtractor):
@ -145,14 +188,28 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
               r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+"
               r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))")
    test = (
-        ("https://www.newgrounds.com/art/view/blitzwuff/ffx", {
-            "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818",
-            "keyword": "3ffcdc8f54a46b8ee1c9b433627f66b750edff51",
-            "content": "cb067d6593598710292cdd340d350d14a26fe075",
+        ("https://www.newgrounds.com/art/view/tomfulp/ryu-is-hawt", {
+            "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e",
+            "content": "8f395e08333eb2457ba8d8b715238f8910221365",
+            "keyword": {
+                "artist"     : ["tomfulp"],
+                "comment"    : "re:Consider this the bottom threshold for ",
+                "date"       : "type:datetime",
+                "description": "re:Consider this the bottom threshold for ",
+                "favorites"  : int,
+                "filename"   : "94_tomfulp_ryu-is-hawt",
+                "height"     : 476,
+                "index"      : 94,
+                "rating"     : "e",
+                "score"      : float,
+                "tags"       : ["ryu", "streetfighter"],
+                "title"      : "Ryu is Hawt",
+                "user"       : "tomfulp",
+                "width"      : 447,
+            },
        }),
-        ("https://art.ngfiles.com/images/587000/587551_blitzwuff_ffx.png", {
-            "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818",
-            "keyword": "3ffcdc8f54a46b8ee1c9b433627f66b750edff51",
+        ("https://art.ngfiles.com/images/0/94_tomfulp_ryu-is-hawt.gif", {
+            "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e",
        }),
    )

@ -163,27 +220,131 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
            self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format(
                self.user, match.group(3))
        else:
-            self.post_url = match.group(0)
+            url = match.group(0)
+            if not url.startswith("http"):
+                url = "https://" + url
+            self.post_url = url

    def posts(self):
        return (self.post_url,)


-class NewgroundsVideoExtractor(NewgroundsExtractor):
-    """Extractor for all videos of a newgrounds user"""
-    subcategory = "video"
-    filename_fmt = "{category}_{index}.{extension}"
-    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$"
-    test = ("https://tomfulp.newgrounds.com/movies", {
-        "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+",
-        "count": ">= 32",
-    })
+class NewgroundsMediaExtractor(NewgroundsExtractor):
+    """Extractor for a media file from newgrounds.com"""
+    subcategory = "media"
+    pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com"
+               r"(/(?:portal/view|audio/listen)/\d+)")
+    test = (
+        ("https://www.newgrounds.com/portal/view/589549", {
+            "url": "48d916d819c99139e6a3acbbf659a78a867d363e",
+            "content": "ceb865426727ec887177d99e0d20bb021e8606ae",
+            "keyword": {
+                "artist"     : ["psychogoldfish", "tomfulp"],
+                "comment"    : "re:People have been asking me how I like the ",
+                "date"       : "type:datetime",
+                "description": "re:People have been asking how I like the ",
+                "favorites"  : int,
+                "filename"   : "527818_alternate_1896",
+                "index"      : 589549,
+                "rating"     : "t",
+                "score"      : float,
+                "tags"       : ["newgrounds", "psychogoldfish",
+                                "rage", "redesign-2012"],
+                "title"      : "Redesign Rage",
+                "user"       : "psychogoldfish",
+            },
+        }),
+        ("https://www.newgrounds.com/audio/listen/609768", {
+            "url": "f4c5490ae559a3b05e46821bb7ee834f93a43c95",
+            "keyword": {
+                "artist"     : ["zj", "tomfulp"],
+                "comment"    : "re:RECORDED 12-09-2014\n\nFrom The ZJ \"Late ",
+                "date"       : "type:datetime",
+                "description": "From The ZJ Report Show!",
+                "favorites"  : 18,
+                "index"      : 609768,
+                "rating"     : "",
+                "score"      : float,
+                "tags"       : ["fulp", "interview", "tom", "zj"],
+                "title"      : "ZJ Interviews Tom Fulp!",
+                "user"       : "zj",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        NewgroundsExtractor.__init__(self, match)
+        self.user = ""
+        self.post_url = self.root + match.group(1)

    def posts(self):
-        return self._pagination(self.root + "/movies/page/1")
+        return (self.post_url,)

-    def extract_post_data(self, page_url):
-        return {
-            "url"  : "ytdl:" + page_url,
-            "index": text.parse_int(page_url.rpartition("/")[2]),
+
+class NewgroundsArtExtractor(NewgroundsExtractor):
+    """Extractor for all images of a newgrounds user"""
+    subcategory = "art"
+    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/art/?$"
+    test = ("https://tomfulp.newgrounds.com/art", {
+        "pattern": NewgroundsImageExtractor.pattern,
+        "count": ">= 3",
+    })
+
+
+class NewgroundsAudioExtractor(NewgroundsExtractor):
+    """Extractor for all audio submissions of a newgrounds user"""
+    subcategory = "audio"
+    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/audio/?$"
+    test = ("https://tomfulp.newgrounds.com/audio", {
+        "pattern": r"https://audio.ngfiles.com/\d+/\d+_.+\.mp3",
+        "count": ">= 4",
+    })
+
+
+class NewgroundsMoviesExtractor(NewgroundsExtractor):
+    """Extractor for all movies of a newgrounds user"""
+    subcategory = "movies"
+    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$"
+    test = ("https://tomfulp.newgrounds.com/movies", {
+        "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+",
+        "range": "1-10",
+        "count": 10,
+    })
+
+
+class NewgroundsUserExtractor(NewgroundsExtractor):
+    """Extractor for a newgrounds user profile"""
+    subcategory = "user"
+    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/?$"
+    test = (
+        ("https://tomfulp.newgrounds.com", {
+            "pattern": "https://tomfulp.newgrounds.com/art$",
+            "count": 1,
+        }),
+        ("https://tomfulp.newgrounds.com", {
+            "options": (("include", "all"),),
+            "pattern": "https://tomfulp.newgrounds.com/(art|audio|movies)$",
+            "count": 3,
+        }),
+    )
+
+    def items(self):
+        data = {}
+        extr_map = {
+            "art": NewgroundsArtExtractor,
+            "audio": NewgroundsAudioExtractor,
+            "movies": NewgroundsMoviesExtractor,
        }
+
+        include = self.config("include", ("art",)) or ()
+        if include == "all":
+            include = extr_map.keys()
+        elif isinstance(include, str):
+            include = include.split(",")
+
+        yield Message.Version, 1
+        for category in include:
+            if category in extr_map:
+                url = self.user_root + "/" + category
+                data["_extractor"] = extr_map[category]
+                yield Message.Queue, url, data