[gelbooru] re-enable API use (closes #56)

Gelbooru's API allows access to all images and is not restricted to the first 20000. This also adds an option to select between API use and manual information extraction in case their API gets disabled again.
2024-11-22 02:32:33 +01:00 · 2017-12-21 21:42:40 +01:00 · 2017-12-21 21:42:40 +01:00 · d0886f411e
commit d0886f411e
parent 8102aae311
3 changed files with 69 additions and 15 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -441,6 +441,18 @@ Description Sets the maximum allowed size for downloaded images.
 =========== =====


+extractor.gelbooru.api
+----------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Enable use of Gelbooru's API.
+
+            Set this value to ``false`` to switch to manual information
+            extraction if the API has been disabled.
+=========== =====
+
+
 extractor.gfycat.format
 -----------------------
 =========== =====
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@ -70,7 +70,8 @@
        },
        "gelbooru":
        {
-            "filename": "{category}_{id:>07}_{md5}.{extension}"
+            "filename": "{category}_{id:>07}_{md5}.{extension}",
+            "api": true
        },
        "reddit":
        {
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@ -9,7 +9,8 @@
 """Extract images from https://gelbooru.com/"""

 from .common import SharedConfigExtractor, Message
-from .. import text, util
+from .. import text, util, exception
+import xml.etree.ElementTree as ET


 class GelbooruExtractor(SharedConfigExtractor):
@ -17,19 +18,26 @@ class GelbooruExtractor(SharedConfigExtractor):
    basecategory = "booru"
    category = "gelbooru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
+    api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"

    def __init__(self):
        SharedConfigExtractor.__init__(self)
        self.start_post = 0
+        self.use_api = self.config("api", True)
+        if self.use_api:
+            self.get_post_data = self.get_post_data_api

    def items(self):
        yield Message.Version, 1
        yield Message.Directory, self.get_metadata()

-        for post_id in util.advance(self.get_posts(), self.start_post):
-            data = self.get_post_data(post_id)
-            url = data["file_url"]
-            yield Message.Url, url, text.nameext_from_url(url, data)
+        for post in util.advance(self.get_posts(), self.start_post):
+            if isinstance(post, str):
+                post = self.get_post_data(post)
+            for key in ("id", "width", "height", "score", "change"):
+                post[key] = util.safe_int(post[key])
+            url = post["file_url"]
+            yield Message.Url, url, text.nameext_from_url(url, post)

    def skip(self, num):
        self.start_post += num
@ -40,7 +48,7 @@ class GelbooruExtractor(SharedConfigExtractor):
        return {}

    def get_posts(self):
-        """Return an iterable containing all relevant post ids"""
+        """Return an iterable containing all relevant post objects"""

    def get_post_data(self, post_id):
        """Extract metadata of a single post"""
@ -58,14 +66,20 @@ class GelbooruExtractor(SharedConfigExtractor):
            (None        , '<li>Score: ', ''),
            ("score"     , '>', '<'),
            ("file_url"  , '<li><a href="http', '"'),
+            ("change"    , ' id="lupdated" value="', '"'),
        ))[0]
-        data["file_url"] = "http" + data["file_url"]
+        data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
        data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
        data["rating"] = (data["rating"] or "?")[0].lower()
-        for key in ("id", "width", "height", "score"):
-            data[key] = util.safe_int(data[key])
+        data["tags"] = " ".join(
+            [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
        return data

+    def get_post_data_api(self, post_id):
+        """Request metadata of a single post from Gelbooru's API"""
+        return ET.fromstring(
+            self.request(self.api_url + "&id=" + post_id).text)[0].attrib
+

 class GelbooruTagExtractor(GelbooruExtractor):
    """Extractor for images from gelbooru.com based on search-tags"""
@ -73,14 +87,20 @@ class GelbooruTagExtractor(GelbooruExtractor):
    directory_fmt = ["{category}", "{tags}"]
    pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
               r"\?page=post&s=list&tags=([^&]+)"]
-    test = [("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
-        "count": 5,
-    })]
-    per_page = 42
+    test = [
+        ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
+            "count": 5,
+        }),
+        ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
+            "options": (("api", False),),
+            "count": 5,
+        }),
+    ]

    def __init__(self, match):
        GelbooruExtractor.__init__(self)
        self.tags = text.unquote(match.group(1).replace("+", " "))
+        self.per_page = 100 if self.use_api else 42
        self.start_page = 0

    def skip(self, num):
@ -93,8 +113,26 @@ class GelbooruTagExtractor(GelbooruExtractor):
        return {"tags": self.tags}

    def get_posts(self):
+        if self.use_api:
+            return self._get_posts_api()
+        return self._get_posts_manual()
+
+    def _get_posts_api(self):
+        params = {
+            # 'pid' is page-id; first page has index 0
+            "tags": self.tags, "limit": self.per_page, "pid": self.start_page}
+        while True:
+            root = ET.fromstring(
+                self.request(self.api_url, params=params).text)
+            for item in root:
+                yield item.attrib
+            if len(root) < self.per_page:
+                return
+            params["pid"] += 1
+
+    def _get_posts_manual(self):
        url = "https://gelbooru.com/index.php?page=post&s=list"
-        # values for 'pid' must be multiples of 42
+        # 'pid' is post-id; values for 'pid' must be multiples of 42
        params = {"tags": self.tags, "pid": self.start_page * self.per_page}

        while True:
@ -127,6 +165,9 @@ class GelbooruPoolExtractor(GelbooruExtractor):
        name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
        self.posts = list(text.extract_iter(page, 'id="p', '"', pos))

+        if not name:
+            raise exception.NotFoundError("pool")
+
        return {
            "pool": util.safe_int(self.pool_id),
            "pool_name": text.unescape(name),