[szurubooru] add 'tag' and 'post' extractors (#3583, #3713)

2024-11-22 10:42:34 +01:00 · 2023-03-01 18:20:37 +01:00 · 2023-03-01 18:20:37 +01:00 · 0d142e403c
commit 0d142e403c
parent 075c965512
5 changed files with 173 additions and 0 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -2647,6 +2647,17 @@ Description
    Download video files.


+extractor.[szurubooru].username & .token
+----------------------------------------
+Type
+    ``string``
+Description
+    Username and login token of your account to access private resources.
+
+    To generate a token, visit ``/user/USERNAME/list-tokens``
+    and click ``Create Token``.
+
+
 extractor.tumblr.avatar
 -----------------------
 Type
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -1232,6 +1232,22 @@ Consider all sites to be NSFW unless otherwise known.
    <td></td>
 </tr>

+<tr>
+    <td colspan="4"><strong>szurubooru Instances</strong></td>
+</tr>
+<tr>
+    <td>Foalcon</td>
+    <td>https://booru.foalcon.com/</td>
+    <td>Posts, Tag Searches</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Bcbnsfw</td>
+    <td>https://booru.bcbnsfw.space/</td>
+    <td>Posts, Tag Searches</td>
+    <td></td>
+</tr>
+
 <tr>
    <td colspan="4"><strong>vichan Imageboards</strong></td>
 </tr>
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -140,6 +140,7 @@ modules = [
    "soundgasm",
    "speakerdeck",
    "subscribestar",
+    "szurubooru",
    "tapas",
    "tcbscans",
    "telegraph",
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for szurubooru instances"""
+
+from . import booru
+from .. import text
+
+import collections
+import binascii
+
+
+class SzurubooruExtractor(booru.BooruExtractor):
+    basecategory = "szurubooru"
+    filename_fmt = "{id}_{version}_{checksumMD5}.{extension}"
+    per_page = 100
+
+    def __init__(self, match):
+        booru.BooruExtractor.__init__(self, match)
+        self.headers = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        username = self.config("username")
+        if username:
+            token = self.config("token")
+            if token:
+                value = username + ":" + token
+                self.headers["Authorization"] = "Token " + \
+                    binascii.b2a_base64(value.encode())[:-1].decode()
+
+    def _api_request(self, endpoint, params=None):
+        url = self.root + "/api" + endpoint
+        return self.request(url, headers=self.headers, params=params).json()
+
+    def _pagination(self, endpoint, params):
+        params["offset"] = 0
+        params["limit"] = self.per_page
+
+        while True:
+            data = self._api_request(endpoint, params)
+            results = data["results"]
+
+            yield from results
+
+            if len(results) < self.per_page:
+                return
+            params["offset"] += len(results)
+
+    def _file_url(self, post):
+        url = post["contentUrl"]
+        if not url.startswith("http"):
+            url = self.root + "/" + url
+        return url
+
+    @staticmethod
+    def _prepare(post):
+        post["date"] = text.parse_datetime(
+            post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+        tags = []
+        append = tags.append
+        tags_categories = collections.defaultdict(list)
+
+        for tag in post["tags"]:
+            tag_type = tag["category"].rpartition("_")[2]
+            tag_name = tag["names"][0]
+            tags_categories[tag_type].append(tag_name)
+            append(tag_name)
+
+        post["tags"] = tags
+        for category, tags in tags_categories.items():
+            post["tags_" + category] = tags
+
+
+BASE_PATTERN = SzurubooruExtractor.update({
+    "foalcon": {
+        "root": "https://booru.foalcon.com",
+        "pattern": r"booru\.foalcon\.com",
+    },
+    "bcbnsfw": {
+        "root": "https://booru.bcbnsfw.space",
+        "pattern": r"booru\.bcbnsfw\.space",
+    },
+})
+
+
+class SzurubooruTagExtractor(SzurubooruExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}_{version}"
+    pattern = BASE_PATTERN + r"/posts/query=([^/?#]+)"
+    test = (
+        ("https://booru.foalcon.com/posts/query=simple_background", {
+            "pattern": r"https://booru\.foalcon\.com/data/posts"
+                       r"/\d+_[0-9a-f]{16}\.\w+",
+            "range": "1-150",
+            "count": 150,
+        }),
+        ("https://booru.bcbnsfw.space/posts/query=simple_background"),
+    )
+
+    def __init__(self, match):
+        SzurubooruExtractor.__init__(self, match)
+        query = match.group(match.lastindex)
+        self.query = text.unquote(query.replace("+", " "))
+
+    def metadata(self):
+        return {"search_tags": self.query}
+
+    def posts(self):
+        return self._pagination("/posts/", {"query": self.query})
+
+
+class SzurubooruPostExtractor(SzurubooruExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}_{version}"
+    pattern = BASE_PATTERN + r"/post/(\d+)"
+    test = (
+        ("https://booru.foalcon.com/post/30092", {
+            "pattern": r"https://booru\.foalcon\.com/data/posts"
+                       r"/30092_b7d56e941888b624\.png",
+            "url": "dad4d4c67d87cd9a4ac429b3414747c27a95d5cb",
+            "content": "86d1514c0ca8197950cc4b74e7a59b2dc76ebf9c",
+        }),
+        ("https://booru.bcbnsfw.space/post/1599", {
+            "pattern": r"https://booru\.bcbnsfw\.space/data/posts"
+                       r"/1599_53784518e92086bd\.png",
+            "content": "0c38fc612ba1f03950fad31c4f80a1fccdab1096",
+        }),
+    )
+
+    def __init__(self, match):
+        SzurubooruExtractor.__init__(self, match)
+        self.post_id = match.group(match.lastindex)
+
+    def posts(self):
+        return (self._api_request("/post/" + self.post_id),)
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@ -264,6 +264,7 @@ BASE_MAP = {
    "lolisafe"    : "lolisafe and chibisafe",
    "lynxchan"    : "LynxChan Imageboards",
    "moebooru"    : "Moebooru and MyImouto",
+    "szurubooru"  : "szurubooru Instances",
    "vichan"      : "vichan Imageboards",
 }