[realbooru] fix extraction

get file URLs from HTML pages
2024-11-22 18:53:21 +01:00 · 2023-04-02 20:45:16 +02:00 · 2023-04-02 20:45:16 +02:00 · ac97aca99c
commit ac97aca99c
parent 75666cf9c3
1 changed file with 24 additions and 2 deletions
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@ -30,7 +30,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
            self.api_root = self.root

        if self.category == "realbooru":
-            self._file_url = self._file_url_realbooru
+            self.items = self._items_realbooru
            self._tags = self._tags_realbooru

    def _api_request(self, params):
@ -129,6 +129,28 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
        return url

+    def _items_realbooru(self):  # bound as self.items for realbooru
+        from .common import Message
+        data = self.metadata()
+        # File URL comes from the post's HTML page, anchored at ">Original<".
+        for post in self.posts():
+            try:
+                html = self._html(post)
+                url = post["file_url"] = text.rextract(
+                    html, 'href="', '"', html.index(">Original<"))[0]
+            except Exception:  # best-effort: any failure skips the post
+                self.log.debug("Unable to fetch download URL for post %s "
+                               "(md5: %s)", post.get("id"), post.get("md5"))
+                continue
+            # URL found; merge in the metadata() result fetched once above.
+            text.nameext_from_url(url, post)
+            post.update(data)
+            self._prepare(post)
+            self._tags(post, html)
+            # Emit the directory message first, then the file URL itself.
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
    def _tags_realbooru(self, post, page):
        tag_container = text.extr(page, 'id="tagLink"', '</div>')
        tags = collections.defaultdict(list)
@ -404,7 +426,7 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
            },
        }),
        ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
-            "pattern": r"https://realbooru\.com/images/dc/b5"
+            "pattern": r"https://realbooru\.com//?images/dc/b5"
                       r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
            "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
            "options": (("tags", True),),