From c5be50fdaad5209eb193111d8a4caf897ebb28d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 4 Oct 2024 21:07:56 +0200 Subject: [PATCH] [pixiv] implement workaround for 'limit_sanity_level' works (#4327, #4747, #5054, #5435, #5651, #5655) Metadata should be ~95% identical (there might be some 'date' differences) and there could be issues with R-18 works, as these require some URL manipulation to transform /c/250x250_80_a2/ thumbnail URLs into /img-original/ ones. --- docs/configuration.rst | 10 ++++ gallery_dl/extractor/pixiv.py | 98 ++++++++++++++++++++++++++++++++--- test/results/pixiv.py | 79 +++++++++++++++++++++++++++- 3 files changed, 180 insertions(+), 7 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 4b1a7066..fe771518 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3525,6 +3525,16 @@ Description A value of ``0`` means no limit. +extractor.pixiv.sanity +---------------------- +Type + ``bool`` +Default + ``true`` +Description + Try to fetch ``limit_sanity_level`` works via web API. + + extractor.plurk.comments ------------------------ Type diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 0127a650..31d12912 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -27,13 +27,14 @@ class PixivExtractor(Extractor): filename_fmt = "{id}_p{num}.{extension}" archive_fmt = "{id}{suffix}.{extension}" cookies_domain = None - url_sanity = ("https://s.pximg.net/common/images" + sanity_url = ("https://s.pximg.net/common/images" "/limit_sanity_level_360.png") def _init(self): self.api = PixivAppAPI(self) self.load_ugoira = self.config("ugoira", True) self.max_posts = self.config("max-posts", 0) + self.sanity_workaround = self.config("sanity", True) def items(self): tags = self.config("tags", "japanese") @@ -102,10 +103,14 @@ class PixivExtractor(Extractor): elif work["page_count"] == 1: url = meta_single_page["original_image_url"] - if url == self.url_sanity: - self.log.warning( - "Unable to download work %s ('sanity_level' warning)", - work["id"]) + if url == self.sanity_url: + if self.sanity_workaround: + self.log.warning("%s: 'sanity_level' warning", work["id"]) + self._extract_ajax(work, files) + else: + self.log.warning( + "%s: Unable to download work ('sanity_level' warning)", + work["id"]) else: files.append({"url": url}) @@ -147,13 +152,93 @@ class PixivExtractor(Extractor): "num": num, "suffix": "_p{:02}".format(num), "_ugoira_frame_index": num, - })) else: files.append({ "url": url.replace("_ugoira600x600", "_ugoira1920x1080", 1), }) + def _extract_ajax(self, work, files): + url = "{}/ajax/illust/{}".format(self.root, work["id"]) + data = self.request(url, headers=self.headers_web).json() + body = data["body"] + + for key_app, key_ajax in ( + ("title" , "illustTitle"), + ("image_urls" , "urls"), + ("caption" , "illustComment"), + ("create_date" , "createDate"), + ("width" , "width"), + ("height" , "height"), + ("sanity_level" , "sl"), + ("total_view" , "viewCount"), + ("total_comments" , "commentCount"), + ("total_bookmarks" , "bookmarkCount"), + ("restrict" , "restrict"), + ("x_restrict" , "xRestrict"), + ("illust_ai_type" , "aiType"), + ("illust_book_style", "bookStyle"), + ): + work[key_app] = body[key_ajax] + + work["user"] = { + "account" : body["userAccount"], + "id" : int(body["userId"]), + "is_followed": False, + "name" : body["userName"], + "profile_image_urls": {}, + } + + work["tags"] = tags = [] + for tag in body["tags"]["tags"]: + name = tag["tag"] + try: + translated_name = tag["translation"]["en"] + except Exception: + translated_name = None + tags.append({"name": name, "translated_name": translated_name}) + + url = self._extract_ajax_url(body) + if not url: + return + + work["page_count"] = count = body["pageCount"] + if count == 1: + files.append({"url": url}) + else: + base, _, ext = url.rpartition("_p0.") + for num in range(count): + url = "{}_p{}.{}".format(base, num, ext) + files.append({ + "url" : url, + "suffix": "_p{:02}".format(num), + }) + + def _extract_ajax_url(self, body): + try: + original = body["urls"]["original"] + if original: + return original + except KeyError: + pass + + try: + square1200 = body["userIllusts"][body["id"]]["url"] + except KeyError: + return + parts = square1200.rpartition("_p0")[0].split("/") + del parts[3:5] + parts[3] = "img-original" + base = "/".join(parts) + + for ext in ("jpg", "png", "gif"): + try: + url = "{}_p0.{}".format(base, ext) + self.request(url, method="HEAD") + return url + except exception.HttpError: + pass + @staticmethod def _date_from_url(url, offset=timedelta(hours=9)): try: @@ -860,6 +945,7 @@ class PixivAppAPI(): self.username = extractor._get_auth_info()[0] self.user = None + extractor.headers_web = extractor.session.headers.copy() extractor.session.headers.update({ "App-OS" : "ios", "App-OS-Version": "16.7.2", diff --git a/test/results/pixiv.py b/test/results/pixiv.py index e1d7f963..02fdde8c 100644 --- a/test/results/pixiv.py +++ b/test/results/pixiv.py @@ -184,11 +184,88 @@ __tests__ = ( { "#url" : "https://www.pixiv.net/artworks/85960783", "#comment" : "limit_sanity_level_360.png (#4327, #5180)", - "#category": ("", "pixiv", "work"), "#class" : pixiv.PixivWorkExtractor, + "#options" : {"sanity": False}, "#count" : 0, }, +{ + "#url" : "https://www.pixiv.net/en/artworks/102932581", + "#comment" : "limit_sanity_level_360.png (#4327, #5180)", + "#class" : pixiv.PixivWorkExtractor, + "#options" : {"sanity": True}, + "#urls" : "https://i.pximg.net/img-original/img/2022/11/20/00/00/49/102932581_p0.jpg", + + "caption" : "Meet a deer .", + "comment_access_control": 0, + "create_date" : "2022-11-19T15:00:00+00:00", + "date" : "dt:2022-11-19 15:00:00", + "date_url" : "dt:2022-11-19 15:00:49", + "extension" : "jpg", + "filename" : "102932581_p0", + "height" : 3840, + "id" : 102932581, + "illust_ai_type": 1, + "illust_book_style": 0, + "is_bookmarked" : False, + "is_muted" : False, + "num" : 0, + "page_count" : 1, + "rating" : "General", + "restrict" : 0, + "sanity_level" : 2, + "series" : None, + "suffix" : "", + "title" : "《 Bridge and Deer 》", + "tools" : [], + "total_bookmarks": range(1900, 3000), + "total_comments": range(3, 10), + "total_view" : range(11000, 20000), + "type" : "illust", + "url" : "https://i.pximg.net/img-original/img/2022/11/20/00/00/49/102932581_p0.jpg", + "visible" : False, + "width" : 2160, + "x_restrict" : 0, + "image_urls" : { + "mini" : "https://i.pximg.net/c/48x48/custom-thumb/img/2022/11/20/00/00/49/102932581_p0_custom1200.jpg", + "original": "https://i.pximg.net/img-original/img/2022/11/20/00/00/49/102932581_p0.jpg", + "regular" : "https://i.pximg.net/img-master/img/2022/11/20/00/00/49/102932581_p0_master1200.jpg", + "small" : "https://i.pximg.net/c/540x540_70/img-master/img/2022/11/20/00/00/49/102932581_p0_master1200.jpg", + "thumb" : "https://i.pximg.net/c/250x250_80_a2/custom-thumb/img/2022/11/20/00/00/49/102932581_p0_custom1200.jpg", + }, + "tags" : [ + "オリジナル", + "風景", + "イラスト", + "illustration", + "美しい", + "女の子", + "少女", + "deer", + "flower", + "spring", + ], + "user" : { + "account" : "805482263", + "id" : 7386235, + "is_followed": False, + "name" : "岛的鲸", + "profile_image_urls": {}, + }, +}, + +{ + "#url" : "https://www.pixiv.net/en/artworks/109487939", + "#comment" : "R-18 limit_sanity_level_360.png (#4327, #5180)", + "#class" : pixiv.PixivWorkExtractor, + "#urls" : [ + "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p0.png", + "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p1.png", + "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p2.png", + "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p3.png", + ], +}, + { "#url" : "https://www.pixiv.net/en/artworks/966412", "#category": ("", "pixiv", "work"),