[pixiv] implement workaround for 'limit_sanity_level' works

(#4327, #4747, #5054, #5435, #5651, #5655) Metadata should be ~95% identical to regular app API results (there might be some 'date' differences). There could be issues with R-18 works, as these require some URL manipulation to transform /c/250x250_80_a2/ thumbnail URLs into /img-original/ ones.
2024-11-22 02:32:33 +01:00 · 2024-10-04 21:07:56 +02:00 · 2024-10-04 21:07:56 +02:00 · c5be50fdaa
commit c5be50fdaa
parent d1432d02a1
3 changed files with 180 additions and 7 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3525,6 +3525,16 @@ Description
    A value of ``0`` means no limit.


+extractor.pixiv.sanity
+----------------------
+Type
+    ``bool``
+Default
+    ``true``
+Description
+    Try to fetch ``limit_sanity_level`` works via web API.
+
+
 extractor.plurk.comments
 ------------------------
 Type
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -27,13 +27,14 @@ class PixivExtractor(Extractor):
    filename_fmt = "{id}_p{num}.{extension}"
    archive_fmt = "{id}{suffix}.{extension}"
    cookies_domain = None
-    url_sanity = ("https://s.pximg.net/common/images"
+    sanity_url = ("https://s.pximg.net/common/images"
                  "/limit_sanity_level_360.png")

    def _init(self):
        self.api = PixivAppAPI(self)
        self.load_ugoira = self.config("ugoira", True)
        self.max_posts = self.config("max-posts", 0)
+        self.sanity_workaround = self.config("sanity", True)

    def items(self):
        tags = self.config("tags", "japanese")
@@ -102,9 +103,13 @@ class PixivExtractor(Extractor):

        elif work["page_count"] == 1:
            url = meta_single_page["original_image_url"]
-            if url == self.url_sanity:
+            if url == self.sanity_url:
+                if self.sanity_workaround:
+                    self.log.warning("%s: 'sanity_level' warning", work["id"])
+                    self._extract_ajax(work, files)
+                else:
                    self.log.warning(
-                    "Unable to download work %s ('sanity_level' warning)",
+                        "%s: Unable to download work ('sanity_level' warning)",
                        work["id"])
            else:
                files.append({"url": url})
@@ -147,13 +152,93 @@ class PixivExtractor(Extractor):
                    "num": num,
                    "suffix": "_p{:02}".format(num),
                    "_ugoira_frame_index": num,
-
                }))
        else:
            files.append({
                "url": url.replace("_ugoira600x600", "_ugoira1920x1080", 1),
            })

+    def _extract_ajax(self, work, files):
+        url = "{}/ajax/illust/{}".format(self.root, work["id"])
+        data = self.request(url, headers=self.headers_web).json()
+        body = data["body"]
+
+        for key_app, key_ajax in (
+            ("title"            , "illustTitle"),
+            ("image_urls"       , "urls"),
+            ("caption"          , "illustComment"),
+            ("create_date"      , "createDate"),
+            ("width"            , "width"),
+            ("height"           , "height"),
+            ("sanity_level"     , "sl"),
+            ("total_view"       , "viewCount"),
+            ("total_comments"   , "commentCount"),
+            ("total_bookmarks"  , "bookmarkCount"),
+            ("restrict"         , "restrict"),
+            ("x_restrict"       , "xRestrict"),
+            ("illust_ai_type"   , "aiType"),
+            ("illust_book_style", "bookStyle"),
+        ):
+            work[key_app] = body[key_ajax]
+
+        work["user"] = {
+            "account"    : body["userAccount"],
+            "id"         : int(body["userId"]),
+            "is_followed": False,
+            "name"       : body["userName"],
+            "profile_image_urls": {},
+        }
+
+        work["tags"] = tags = []
+        for tag in body["tags"]["tags"]:
+            name = tag["tag"]
+            try:
+                translated_name = tag["translation"]["en"]
+            except Exception:
+                translated_name = None
+            tags.append({"name": name, "translated_name": translated_name})
+
+        url = self._extract_ajax_url(body)
+        if not url:
+            return
+
+        work["page_count"] = count = body["pageCount"]
+        if count == 1:
+            files.append({"url": url})
+        else:
+            base, _, ext = url.rpartition("_p0.")
+            for num in range(count):
+                url = "{}_p{}.{}".format(base, num, ext)
+                files.append({
+                    "url"   : url,
+                    "suffix": "_p{:02}".format(num),
+                })
+
+    def _extract_ajax_url(self, body):
+        try:
+            original = body["urls"]["original"]
+            if original:
+                return original
+        except KeyError:
+            pass
+
+        try:
+            square1200 = body["userIllusts"][body["id"]]["url"]
+        except KeyError:
+            return
+        parts = square1200.rpartition("_p0")[0].split("/")
+        del parts[3:5]
+        parts[3] = "img-original"
+        base = "/".join(parts)
+
+        for ext in ("jpg", "png", "gif"):
+            try:
+                url = "{}_p0.{}".format(base, ext)
+                self.request(url, method="HEAD")
+                return url
+            except exception.HttpError:
+                pass
+
    @staticmethod
    def _date_from_url(url, offset=timedelta(hours=9)):
        try:
@@ -860,6 +945,7 @@ class PixivAppAPI():
        self.username = extractor._get_auth_info()[0]
        self.user = None

+        extractor.headers_web = extractor.session.headers.copy()
        extractor.session.headers.update({
            "App-OS"        : "ios",
            "App-OS-Version": "16.7.2",
--- a/test/results/pixiv.py
+++ b/test/results/pixiv.py
@@ -184,11 +184,88 @@ __tests__ = (
 {
    "#url"     : "https://www.pixiv.net/artworks/85960783",
    "#comment" : "limit_sanity_level_360.png (#4327, #5180)",
-    "#category": ("", "pixiv", "work"),
    "#class"   : pixiv.PixivWorkExtractor,
+    "#options" : {"sanity": False},
    "#count"   : 0,
 },

+{
+    "#url"     : "https://www.pixiv.net/en/artworks/102932581",
+    "#comment" : "limit_sanity_level_360.png (#4327, #5180)",
+    "#class"   : pixiv.PixivWorkExtractor,
+    "#options" : {"sanity": True},
+    "#urls"    : "https://i.pximg.net/img-original/img/2022/11/20/00/00/49/102932581_p0.jpg",
+
+    "caption"       : "Meet a deer .",
+    "comment_access_control": 0,
+    "create_date"   : "2022-11-19T15:00:00+00:00",
+    "date"          : "dt:2022-11-19 15:00:00",
+    "date_url"      : "dt:2022-11-19 15:00:49",
+    "extension"     : "jpg",
+    "filename"      : "102932581_p0",
+    "height"        : 3840,
+    "id"            : 102932581,
+    "illust_ai_type": 1,
+    "illust_book_style": 0,
+    "is_bookmarked" : False,
+    "is_muted"      : False,
+    "num"           : 0,
+    "page_count"    : 1,
+    "rating"        : "General",
+    "restrict"      : 0,
+    "sanity_level"  : 2,
+    "series"        : None,
+    "suffix"        : "",
+    "title"         : "《 Bridge and Deer 》",
+    "tools"         : [],
+    "total_bookmarks": range(1900, 3000),
+    "total_comments": range(3, 10),
+    "total_view"    : range(11000, 20000),
+    "type"          : "illust",
+    "url"           : "https://i.pximg.net/img-original/img/2022/11/20/00/00/49/102932581_p0.jpg",
+    "visible"       : False,
+    "width"         : 2160,
+    "x_restrict"    : 0,
+    "image_urls"    : {
+        "mini"    : "https://i.pximg.net/c/48x48/custom-thumb/img/2022/11/20/00/00/49/102932581_p0_custom1200.jpg",
+        "original": "https://i.pximg.net/img-original/img/2022/11/20/00/00/49/102932581_p0.jpg",
+        "regular" : "https://i.pximg.net/img-master/img/2022/11/20/00/00/49/102932581_p0_master1200.jpg",
+        "small"   : "https://i.pximg.net/c/540x540_70/img-master/img/2022/11/20/00/00/49/102932581_p0_master1200.jpg",
+        "thumb"   : "https://i.pximg.net/c/250x250_80_a2/custom-thumb/img/2022/11/20/00/00/49/102932581_p0_custom1200.jpg",
+    },
+    "tags"          : [
+        "オリジナル",
+        "風景",
+        "イラスト",
+        "illustration",
+        "美しい",
+        "女の子",
+        "少女",
+        "deer",
+        "flower",
+        "spring",
+    ],
+    "user"          : {
+        "account"    : "805482263",
+        "id"         : 7386235,
+        "is_followed": False,
+        "name"       : "岛的鲸",
+        "profile_image_urls": {},
+    },
+},
+
+{
+    "#url"     : "https://www.pixiv.net/en/artworks/109487939",
+    "#comment" : "R-18 limit_sanity_level_360.png (#4327, #5180)",
+    "#class"   : pixiv.PixivWorkExtractor,
+    "#urls"    : [
+        "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p0.png",
+        "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p1.png",
+        "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p2.png",
+        "https://i.pximg.net/img-original/img/2023/07/01/00/06/28/109487939_p3.png",
+    ],
+},
+
 {
    "#url"     : "https://www.pixiv.net/en/artworks/966412",
    "#category": ("", "pixiv", "work"),