[e621] implement 'notes' and 'pools' metadata extraction

(#3425)
2024-11-22 02:32:33 +01:00 · 2023-02-04 21:20:38 +01:00 · 2023-02-04 21:20:38 +01:00 · bbf0911a46
commit bbf0911a46
parent 925b467496
3 changed files with 124 additions and 31 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -1103,8 +1103,21 @@ Description
    follow the ``source`` and download from there if possible.


-extractor.danbooru.metadata
---------------------------
+extractor.danbooru.ugoira
+-------------------------
+Type
+    ``bool``
+Default
+    ``false``
+Description
+    Controls the download target for Ugoira posts.
+
+    * ``true``: Original ZIP archives
+    * ``false``: Converted video files
+
+
+extractor.[Danbooru].metadata
+-----------------------------
 Type
    * ``bool``
    * ``string``
@@ -1125,8 +1138,8 @@ Description
    Note: This requires 1 additional HTTP request per post.


-extractor.danbooru.threshold
----------------------------
+extractor.[Danbooru].threshold
+------------------------------
 Type
    * ``string``
    * ``integer``
@@ -1135,27 +1148,13 @@ Default
 Description
    Stop paginating over API results if the length of a batch of returned
    posts is less than the specified number. Defaults to the per-page limit
-    of the current instance, which is 320 for ``e621`` and 200 for
-    everything else.
+    of the current instance, which is 200.

    Note: Changing this setting is normally not necessary. When the value is
    greater than the per-page limit, gallery-dl will stop after the first
    batch. The value cannot be less than 1.


-extractor.danbooru.ugoira
-------------------------
-Type
-    ``bool``
-Default
-    ``false``
-Description
-    Controls the download target for Ugoira posts.
-
-    * ``true``: Original ZIP archives
-    * ``false``: Converted video files
-
-
 extractor.derpibooru.api-key
 ----------------------------
 Type
@@ -1388,6 +1387,40 @@ Description
    Minimum wait time in seconds before API requests.


+extractor.[E621].metadata
+-------------------------
+Type
+    * ``bool``
+    * ``string``
+    * ``list`` of ``strings``
+Default
+    ``false``
+Example
+    * ``notes,pools``
+    * ``["notes", "pools"]``
+Description
+    Extract additional metadata (notes, pool metadata) if available.
+
+    Note: This requires 0-2 additional HTTP requests per post.
+
+
+extractor.[E621].threshold
+--------------------------
+Type
+    * ``string``
+    * ``integer``
+Default
+    ``"auto"``
+Description
+    Stop paginating over API results if the length of a batch of returned
+    posts is less than the specified number. Defaults to the per-page limit
+    of the current instance, which is 320.
+
+    Note: Changing this setting is normally not necessary. When the value is
+    greater than the per-page limit, gallery-dl will stop after the first
+    batch. The value cannot be less than 1.
+
+
 extractor.exhentai.domain
 -------------------------
 Type
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -26,16 +26,6 @@ class DanbooruExtractor(BaseExtractor):
        self.ugoira = self.config("ugoira", False)
        self.external = self.config("external", False)

-        metadata = self.config("metadata", False)
-        if metadata:
-            if isinstance(metadata, (list, tuple)):
-                metadata = ",".join(metadata)
-            elif not isinstance(metadata, str):
-                metadata = "artist_commentary,children,notes,parent,uploader"
-            self.metadata_includes = metadata
-        else:
-            self.metadata_includes = None
-
        threshold = self.config("threshold")
        if isinstance(threshold, int):
            self.threshold = 1 if threshold < 1 else threshold
@@ -55,6 +45,13 @@ class DanbooruExtractor(BaseExtractor):
        return pages * self.per_page

    def items(self):
+        includes = self.config("metadata")
+        if includes:
+            if isinstance(includes, (list, tuple)):
+                includes = ",".join(includes)
+            elif not isinstance(includes, str):
+                includes = "artist_commentary,children,notes,parent,uploader"
+
        data = self.metadata()
        for post in self.posts():

@@ -77,9 +74,9 @@ class DanbooruExtractor(BaseExtractor):
                    url = post["large_file_url"]
                    post["extension"] = "webm"

-            if self.metadata_includes:
+            if includes:
                meta_url = "{}/posts/{}.json?only={}".format(
-                    self.root, post["id"], self.metadata_includes)
+                    self.root, post["id"], includes)
                post.update(self.request(meta_url).json())

            if url[0] == "/":
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -29,6 +29,16 @@ class E621Extractor(danbooru.DanbooruExtractor):
        self.headers = {"User-Agent": "gallery-dl/{} (by mikf)".format(
            version.__version__)}

+        includes = self.config("metadata") or ()
+        if includes:
+            if isinstance(includes, str):
+                includes = includes.split(",")
+            elif not isinstance(includes, (list, tuple)):
+                includes = ("notes", "pools")
+
+        notes = ("notes" in includes)
+        pools = ("pools" in includes)
+
        data = self.metadata()
        for post in self.posts():
            file = post["file"]
@@ -38,6 +48,18 @@ class E621Extractor(danbooru.DanbooruExtractor):
                file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
                    self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])

+            if notes and post.get("has_notes"):
+                url = "{}/notes.json?search[post_id]={}".format(
+                    self.root, post["id"])
+                post["notes"] = self.request(url).json()
+
+            if pools and post["pools"]:
+                url = "{}/pools.json?search[id]={}".format(
+                    self.root, ",".join(map(str, post["pools"])))
+                post["pools"] = _pools = self.request(url).json()
+                for pool in _pools:
+                    pool["name"] = pool["name"].replace("_", " ")
+
            post["filename"] = file["md5"]
            post["extension"] = file["ext"]

@@ -124,6 +146,47 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
            "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
            "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
        }),
+        ("https://e621.net/posts/3181052", {
+            "options": (("metadata", "notes,pools"),),
+            "pattern": r"https://static\d\.e621\.net/data/c6/8c"
+                       r"/c68cca0643890b615f75fb2719589bff\.png",
+            "keyword": {
+                "notes": [
+                    {
+                        "body": "Little Legends 2",
+                        "created_at": "2022-05-16T13:58:38.877-04:00",
+                        "creator_id": 517450,
+                        "creator_name": "EeveeCuddler69",
+                        "height": 475,
+                        "id": 321296,
+                        "is_active": True,
+                        "post_id": 3181052,
+                        "updated_at": "2022-05-16T13:59:02.050-04:00",
+                        "version": 3,
+                        "width": 809,
+                        "x": 83,
+                        "y": 117,
+                    },
+                ],
+                "pools": [
+                    {
+                        "category": "series",
+                        "created_at": "2022-02-17T00:29:22.669-05:00",
+                        "creator_id": 1077440,
+                        "creator_name": "Yeetus90",
+                        "description": "* \"Little Legends\":/pools/27971\r\n"
+                                       "* Little Legends 2\r\n"
+                                       "* \"Little Legends 3\":/pools/27481",
+                        "id": 27492,
+                        "is_active": False,
+                        "name": "Little Legends 2",
+                        "post_count": 39,
+                        "post_ids": list,
+                        "updated_at": "2022-03-27T06:30:03.382-04:00"
+                    },
+                ],
+            },
+        }),
        ("https://e621.net/post/show/535"),

        ("https://e926.net/posts/535", {