From 7c0d2ca07d5415c521fc710a3c5d1c1b95465994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 3 Nov 2024 09:59:25 +0100 Subject: [PATCH] [rule34vault] update - implement 'tags' categorization - don't use 'totalCount' for pagination end - update tests --- gallery_dl/extractor/rule34vault.py | 32 +++++++++--- test/results/rule34vault.py | 79 +++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 23 deletions(-) diff --git a/gallery_dl/extractor/rule34vault.py b/gallery_dl/extractor/rule34vault.py index de2c0bf6..8c8abfa1 100644 --- a/gallery_dl/extractor/rule34vault.py +++ b/gallery_dl/extractor/rule34vault.py @@ -8,6 +8,7 @@ from .booru import BooruExtractor from .. import text +import collections BASE_PATTERN = r"(?:https?://)?rule34vault\.com" @@ -19,11 +20,19 @@ class Rule34vaultExtractor(BooruExtractor): filename_fmt = "{category}_{id}.{extension}" per_page = 100 + TAG_TYPES = { + 1: "general", + 2: "copyright", + 4: "character", + 8: "artist", + } + def _file_url(self, post): post_id = post["id"] extension = "jpg" if post["type"] == 0 else "mp4" - return "{}/posts/{}/{}/{}.{}".format( + post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format( self.root_cdn, post_id // 1000, post_id, post_id, extension) + return url def _prepare(self, post): post.pop("files", None) @@ -36,6 +45,13 @@ class Rule34vaultExtractor(BooruExtractor): if "tags" not in post: post.update(self._fetch_post(post["id"])) + tags = collections.defaultdict(list) + for tag in post["tags"]: + tags[tag["type"]].append(tag["value"]) + types = self.TAG_TYPES + for type, values in tags.items(): + post["tags_" + types[type]] = values + def _fetch_post(self, post_id): url = "{}/api/v2/post/{}".format(self.root, post_id) return self.request(url).json() @@ -45,19 +61,19 @@ class Rule34vaultExtractor(BooruExtractor): if params is None: params = {} - params["CountTotal"] = True + params["CountTotal"] = False params["Skip"] = self.page_start * self.per_page params["take"] = self.per_page + threshold = self.per_page while True: data = self.request(url, method="POST", json=params).json() yield from data["items"] - if params["Skip"] + params["take"] > data["totalCount"]: + if len(data["items"]) < threshold: return - if "cursor" in data: - params["cursor"] = data["cursor"] + params["cursor"] = data.get("cursor") params["Skip"] += params["take"] @@ -65,7 +81,7 @@ class Rule34vaultPostExtractor(Rule34vaultExtractor): subcategory = "post" archive_fmt = "{id}" pattern = BASE_PATTERN + r"/post/(\d+)" - example = "https://rule34vault.com/post/399437" + example = "https://rule34vault.com/post/12345" def posts(self): return (self._fetch_post(self.groups[0]),) @@ -76,7 +92,7 @@ class Rule34vaultPlaylistExtractor(Rule34vaultExtractor): directory_fmt = ("{category}", "{playlist_id}") archive_fmt = "p_{playlist_id}_{id}" pattern = BASE_PATTERN + r"/playlists/view/(\d+)" - example = "https://rule34vault.com/playlists/view/2" + example = "https://rule34vault.com/playlists/view/12345" def metadata(self): return {"playlist_id": self.groups[0]} @@ -90,7 +106,7 @@ class Rule34vaultTagExtractor(Rule34vaultExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/([^/?#]+)$" + pattern = BASE_PATTERN + r"/(?!p(?:ost|laylists)/)([^/?#]+)" example = "https://rule34vault.com/TAG" def metadata(self): diff --git a/test/results/rule34vault.py b/test/results/rule34vault.py index 2fe31a1f..425ef7ed 100644 --- a/test/results/rule34vault.py +++ b/test/results/rule34vault.py @@ -16,21 +16,6 @@ __tests__ = ( "#count" : 10, }, -{ - "#url" : "https://rule34vault.com/post/486545", - "#class": rule34vault.Rule34vaultPostExtractor, - "#pattern" : r"https://r34xyz\.b-cdn.net/posts/486/486545/486545\.jpg", - "#sha1_content": "8f53c4c9d049842d23b51fb3cf8ce11bcbe21f07", -}, - -{ - "#url" : "https://rule34vault.com/post/382937", - "#comment": "video", - "#class" : rule34vault.Rule34vaultPostExtractor, - "#pattern" : r"https://r34xyz\.b-cdn.net/posts/382/382937/382937\.mp4", - "#sha1_content": "b962e3e2304139767c3792508353e6e83a85a2af", -}, - { "#url" : "https://rule34vault.com/playlists/view/20164", "#class": rule34vault.Rule34vaultPlaylistExtractor, @@ -38,4 +23,68 @@ __tests__ = ( "#count" : 55, }, +{ + "#url" : "https://rule34vault.com/post/280517", + "#comment": "image", + "#class" : rule34vault.Rule34vaultPostExtractor, + "#options": {"tags": True}, + "#pattern" : "https://r34xyz.b-cdn.net/posts/280/280517/280517.jpg", + "#sha1_content": "1e19d601b4a79c06e6f885a83a5003e7e2a17057", + + "created" : "2023-09-01T11:57:57.317331Z", + "date" : "dt:2023-09-01 11:57:57", + "extension" : "jpg", + "file_url" : "https://r34xyz.b-cdn.net/posts/280/280517/280517.jpg", + "filename" : "280517", + "height" : 1152, + "id" : 280517, + "likes" : range(3, 100), + "posted" : "2023-09-01T12:01:41.008547Z", + "status" : 2, + "type" : 0, + "uploaderId": 20678, + "views" : range(90, 999), + "width" : 768, + "data": { + "sources": [ + "https://trynectar.ai/view/87c98fc8-e4f3-447c-a0d3-024b1890580a", + ], + }, + "tags": [ + "ai generated", + "demon slayer", + "kamado nezuko", + "school uniform", + "sfw", + ], + "tags_character": [ + "kamado nezuko", + ], + "tags_copyright": [ + "demon slayer", + ], + "tags_general": [ + "ai generated", + "school uniform", + "sfw", + ], + "uploader": { + "created" : "2023-07-24T04:33:36.734495Z", + "data" : None, + "displayName" : "quick1e", + "emailVerified": False, + "id" : 20678, + "role" : 1, + "userName" : "quick1e", + }, +}, + +{ + "#url" : "https://rule34vault.com/post/382937", + "#comment": "video", + "#class" : rule34vault.Rule34vaultPostExtractor, + "#urls" : "https://r34xyz.b-cdn.net/posts/382/382937/382937.mp4", + "#sha1_content": "b962e3e2304139767c3792508353e6e83a85a2af", +}, + )