[pinterest] support 'story' pins (#6188, #6078, #4229)

2024-11-21 18:22:30 +01:00 · 2024-10-19 17:42:01 +02:00 · 2024-10-19 17:42:01 +02:00 · 5d984f35aa
commit 5d984f35aa
parent 5477ed181d
3 changed files with 150 additions and 33 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3400,6 +3400,16 @@ Description
    Include pins from board sections.


+extractor.pinterest.stories
+---------------------------
+Type
+    ``bool``
+Default
+    ``true``
+Description
+    Extract files from story pins.
+
+
 extractor.pinterest.videos
 --------------------------
 Type
@@ -4447,7 +4457,14 @@ Description
    `fallback <extractor.*.fallback_>`_ URLs.

    Known available sizes are
-    ``4096x4096``, ``orig``, ``large``, ``medium``, and ``small``.
+
+    * ``orig``
+    * ``large``
+    * ``medium``
+    * ``small``
+    * ``4096x4096``
+    * ``900x900``
+    * ``360x360``


 extractor.twitter.logout
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -18,8 +18,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
 class PinterestExtractor(Extractor):
    """Base class for pinterest extractors"""
    category = "pinterest"
-    filename_fmt = "{category}_{id}{media_id:?_//}.{extension}"
-    archive_fmt = "{id}{media_id}"
+    filename_fmt = "{category}_{id}{media_id|page_id:?_//}.{extension}"
+    archive_fmt = "{id}{media_id|page_id}"
    root = "https://www.pinterest.com"

    def _init(self):
@@ -30,6 +30,7 @@ class PinterestExtractor(Extractor):
            self.root = text.ensure_http_scheme(domain)

        self.api = PinterestAPI(self)
+        self.stories = self.config("stories", True)
        self.videos = self.config("videos", True)

    def items(self):
@@ -62,6 +63,8 @@ class PinterestExtractor(Extractor):

                if "media_id" not in file:
                    pin["media_id"] = ""
+                if "page_id" not in file:
+                    pin["page_id"] = ""

                if pin["extension"] == "m3u8":
                    url = "ytdl:" + url
@@ -77,37 +80,107 @@ class PinterestExtractor(Extractor):
        """Return all relevant pin objects"""

    def _extract_files(self, pin):
+        """Return the file objects of 'pin' as a list or tuple
+
+        Sources are tried in this order: story data (only when
+        'self.stories' is true), carousel slots, video (only when
+        'self.videos' is true), and finally the pin's 'orig' image.
+        An empty tuple is returned when the 'orig' image lookup fails.
+        """
+        # story pins take precedence over every other media source
+        story_pin_data = pin.get("story_pin_data")
+        if story_pin_data and self.stories:
+            return self._extract_story(pin, story_pin_data)
+
        carousel_data = pin.get("carousel_data")
        if carousel_data:
-            files = []
-            for slot in carousel_data["carousel_slots"]:
-                size, image = next(iter(slot["images"].items()))
-                slot["media_id"] = slot.pop("id")
-                slot["url"] = image["url"].replace(
-                    "/" + size + "/", "/originals/", 1)
-                files.append(slot)
-            return files
+            return self._extract_carousel(pin, carousel_data)

        videos = pin.get("videos")
-        if videos:
-            if not self.videos:
-                return ()
-            pass
+        if videos and self.videos:
+            return (self._extract_video(videos),)

-            video_formats = videos["video_list"]
-            for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
-                if fmt in video_formats:
-                    file = video_formats[fmt]
-                    break
-            else:
-                file = max(video_formats.values(),
-                           key=lambda x: x.get("width", 0))
+        # last resort: the pin's original image; this is also reached
+        # for story or video pins whose option is disabled
+        try:
+            return (pin["images"]["orig"],)
+        except Exception:
+            self.log.debug("%s: No files found", pin.get("id"))
+            return ()

-            if "V_720P" in video_formats:
-                file["_fallback"] = (video_formats["V_720P"]["url"],)
-            return (file,)
+    def _extract_story(self, pin, story):
+        """Return a list of media objects for all supported story blocks"""
+        files = []
+        story_id = story.get("id")

-        return (pin["images"]["orig"],)
+        for page in story["pages"]:
+            page_id = page.get("id")
+
+            for block in page["blocks"]:
+                # NOTE(review): 'type' shadows the builtin inside this loop
+                type = block.get("type")
+
+                if type == "story_pin_image_block":
+                    # a story made of exactly one page with one block:
+                    # prefer the pin's own 'orig' image and only build
+                    # one from the block when that lookup fails
+                    if 1 == len(page["blocks"]) == len(story["pages"]):
+                        try:
+                            media = pin["images"]["orig"]
+                        except Exception:
+                            media = self._extract_image(page, block)
+                    else:
+                        media = self._extract_image(page, block)
+
+                elif type == "story_pin_video_block":
+                    video = block["video"]
+                    media = self._extract_video(video)
+                    # use an empty string when the video has no 'id'
+                    media["media_id"] = video.get("id") or ""
+
+                elif type == "story_pin_paragraph_block":
+                    # text is emitted as a 'text:' URL with 'txt' extension;
+                    # 'media_id' is None when the block has no 'id'
+                    media = {"url": "text:" + block["text"],
+                             "extension": "txt",
+                             "media_id": block.get("id")}
+
+                else:
+                    # unknown block types are reported and skipped
+                    self.log.warning("%s: Unsupported story block '%s'",
+                                     pin.get("id"), type)
+                    continue
+
+                # tag every file with the story and page it came from
+                media["story_id"] = story_id
+                media["page_id"] = page_id
+                files.append(media)
+
+        return files
+
+    def _extract_carousel(self, pin, carousel_data):
+        """Return the carousel's slot objects with 'url' and 'media_id' set"""
+        files = []
+        for slot in carousel_data["carousel_slots"]:
+            # take the first listed size and rewrite its path segment
+            # to 'originals' (first occurrence only)
+            size, image = next(iter(slot["images"].items()))
+            # slots are modified in place: 'id' is moved to 'media_id'
+            slot["media_id"] = slot.pop("id")
+            slot["url"] = image["url"].replace(
+                "/" + size + "/", "/originals/", 1)
+            files.append(slot)
+        return files
+
+    def _extract_image(self, page, block):
+        """Return a media object for an image block, with fallback URLs"""
+        # prefer the block's own signature, else the page's
+        sig = block.get("image_signature") or page["image_signature"]
+        # 'originals' URL without extension; the path consists of the
+        # first three 2-character pairs of the signature
+        url_base = "https://i.pinimg.com/originals/{}/{}/{}/{}.".format(
+            sig[0:2], sig[2:4], sig[4:6], sig)
+        url_jpg = url_base + "jpg"
+        url_png = url_base + "png"
+        url_webp = url_base + "webp"
+
+        # use the block's 'originals' entry if present,
+        # otherwise fall back to the constructed jpg URL
+        try:
+            media = block["image"]["images"]["originals"]
+        except Exception:
+            media = {"url": url_jpg, "_fallback": (url_png, url_webp,)}
+
+        # '_fallback' is always (re)assigned here, overwriting the value
+        # set in the 'except' branch above; the jpg URL is only listed
+        # as a fallback when it is not already the primary URL
+        if media["url"] == url_jpg:
+            media["_fallback"] = (url_png, url_webp,)
+        else:
+            media["_fallback"] = (url_jpg, url_png, url_webp,)
+        media["media_id"] = sig
+
+        return media
+
+    def _extract_video(self, video):
+        """Return the preferred format object from a video's 'video_list'"""
+        video_formats = video["video_list"]
+        # pick the first available HLS format in this order
+        for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
+            if fmt in video_formats:
+                media = video_formats[fmt]
+                break
+        else:
+            # no listed HLS format: take the format with the largest
+            # 'width' (a missing 'width' counts as 0)
+            media = max(video_formats.values(),
+                        key=lambda x: x.get("width", 0))
+        # offer the 720p URL as fallback when that format exists
+        if "V_720P" in video_formats:
+            media["_fallback"] = (video_formats["V_720P"]["url"],)
+        return media


 class PinterestPinExtractor(PinterestExtractor):
--- a/test/results/pinterest.py
+++ b/test/results/pinterest.py
@@ -23,9 +23,38 @@ __tests__ = (
 {
    "#url"     : "https://www.pinterest.com/pin/422564377542934214/",
    "#comment" : "video pin (#1189)",
-    "#category": ("", "pinterest", "pin"),
    "#class"   : pinterest.PinterestPinExtractor,
    "#pattern" : r"https://v\d*\.pinimg\.com/videos/mc/hls/d7/22/ff/d722ff00ab2352981b89974b37909de8.m3u8",
+    "#exception": exception.NotFoundError,
+},
+
+{
+    "#url"     : "https://jp.pinterest.com/pin/858146904010573850/",
+    "#comment" : "story pin with images",
+    "#class"   : pinterest.PinterestPinExtractor,
+    "#urls"    : (
+        "https://i.pinimg.com/originals/0f/b0/8c/0fb08c519067dd263a1fcfecea775450.jpg",
+        "https://i.pinimg.com/originals/2f/27/f3/2f27f3eb781b107ce58bf588c12a12b7.jpg",
+        "https://i.pinimg.com/originals/55/fd/df/55fddf8d26aa0d96071af52ac6a0c25f.jpg",
+    ),
+},
+
+{
+    "#url"     : "https://www.pinterest.com/pin/63824519713049795/",
+    "#comment" : "story pin with video (#6188)",
+    "#class"   : pinterest.PinterestPinExtractor,
+    "#urls"    : "ytdl:https://v1.pinimg.com/videos/iht/hls/7a/b0/cc/7ab0cc56dcbfc1508b8d650af7b0a593.m3u8",
+
+    "extension"     : "mp4",
+    "_ytdl_manifest": "hls",
+},
+
+{
+    "#url"     : "https://jp.pinterest.com/pin/851532242064221228/",
+    "#comment" : "story pin with text",
+    "#class"   : pinterest.PinterestPinExtractor,
+    "#range"   : "2",
+    "#urls"    : "text:Everskies character+outfits i made",
 },

 {
@@ -37,10 +66,8 @@ __tests__ = (

 {
    "#url"     : "https://www.pinterest.com/g1952849/test-/",
-    "#category": ("", "pinterest", "board"),
    "#class"   : pinterest.PinterestBoardExtractor,
-    "#pattern" : r"https://i\.pinimg\.com/originals/",
-    "#count"   : 2,
+    "#urls"    : "https://i.pinimg.com/originals/d4/f4/7f/d4f47fa2fce4c4c28475af5d94972904.jpg",
 },

 {
@@ -103,14 +130,14 @@ __tests__ = (
    "#category": ("", "pinterest", "allpins"),
    "#class"   : pinterest.PinterestAllpinsExtractor,
    "#pattern" : r"https://i\.pinimg\.com/originals/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w{3}",
-    "#count"   : 7,
+    "#count"   : 9,
 },

 {
    "#url"     : "https://www.pinterest.de/digitalmomblog/_created/",
    "#category": ("", "pinterest", "created"),
    "#class"   : pinterest.PinterestCreatedExtractor,
-    "#pattern" : r"ytdl:|https://i\.pinimg\.com/originals/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.(jpg|png)",
+    "#pattern" : r"ytdl:|https://i\.pinimg\.com/originals/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.(jpg|png|webp)",
    "#range"   : "1-10",
    "#count"   : 10,
 },