[vsco] add 'space' extractor (#5202)

2024-11-25 04:02:32 +01:00 · 2024-02-17 01:51:27 +01:00 · 2024-02-17 01:51:27 +01:00 · 7033cc14e9
commit 7033cc14e9
parent 770aec922d
3 changed files with 63 additions and 6 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -940,7 +940,7 @@ Consider all listed sites to potentially be NSFW.
 <tr>
    <td>VSCO</td>
    <td>https://vsco.co/</td>
-    <td>Collections, individual Images, User Profiles</td>
+    <td>Collections, individual Images, Spaces, User Profiles</td>
    <td></td>
 </tr>
 <tr>
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@ -11,8 +11,8 @@
 from .common import Extractor, Message
 from .. import text, util

-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co"
+USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)"


 class VscoExtractor(Extractor):
@ -115,7 +115,7 @@ class VscoExtractor(Extractor):
 class VscoUserExtractor(VscoExtractor):
    """Extractor for images from a user on vsco.co"""
    subcategory = "user"
-    pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
+    pattern = USER_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
    example = "https://vsco.co/USER/gallery"

    def images(self):
@ -139,7 +139,7 @@ class VscoCollectionExtractor(VscoExtractor):
    subcategory = "collection"
    directory_fmt = ("{category}", "{user}", "collection")
    archive_fmt = "c_{user}_{id}"
-    pattern = BASE_PATTERN + r"/collection/"
+    pattern = USER_PATTERN + r"/collection/"
    example = "https://vsco.co/USER/collection/12345"

    def images(self):
@ -159,10 +159,59 @@ class VscoCollectionExtractor(VscoExtractor):
        ))


+class VscoSpaceExtractor(VscoExtractor):
+    """Extractor for a vsco.co space"""
+    subcategory = "space"
+    directory_fmt = ("{category}", "space", "{user}")
+    archive_fmt = "s_{user}_{id}"
+    pattern = BASE_PATTERN + r"/spaces/([^/?#]+)"
+    example = "https://vsco.co/spaces/a1b2c3d4e5f"
+
+    def images(self):
+        url = "{}/spaces/{}".format(self.root, self.user)
+        data = self._extract_preload_state(url)
+
+        tkn = data["users"]["currentUser"]["tkn"]
+        sid = self.user
+
+        posts = data["entities"]["posts"]
+        images = data["entities"]["postImages"]
+        for post in posts.values():
+            post["image"] = images[post["image"]]
+
+        space = data["spaces"]["byId"][sid]
+        space["postsList"] = [posts[pid] for pid in space["postsList"]]
+
+        url = "{}/grpc/spaces/{}/posts".format(self.root, sid)
+        params = {}
+        return self._pagination(url, params, tkn, space)
+
+    def _pagination(self, url, params, token, data):
+        headers = {
+            "Accept"       : "application/json",
+            "Referer"      : "{}/spaces/{}".format(self.root, self.user),
+            "Content-Type" : "application/json",
+            "Authorization": "Bearer " + token,
+        }
+
+        while True:
+            for post in data["postsList"]:
+                post = self._transform_media(post["image"])
+                post["upload_date"] = post["upload_date"]["sec"] * 1000
+                yield post
+
+            cursor = data["cursor"]
+            if cursor.get("atEnd"):
+                return
+            params["cursor"] = cursor["postcursorcontext"]["postId"]
+
+            data = self.request(url, params=params, headers=headers).json()
+
+
 class VscoImageExtractor(VscoExtractor):
    """Extractor for individual images on vsco.co"""
    subcategory = "image"
-    pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)"
+    pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)"
    example = "https://vsco.co/USER/media/0123456789abcdef"

    def __init__(self, match):
--- a/test/results/vsco.py
+++ b/test/results/vsco.py
@ -38,6 +38,14 @@ __tests__ = (
    "#count"   : 80,
 },

+{
+    "#url"     : "https://vsco.co/spaces/6320a3e1e0338d1350b33fea",
+    "#category": ("", "vsco", "space"),
+    "#class"   : vsco.VscoSpaceExtractor,
+    "#pattern" : r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/[\w\s-]+\.\w+",
+    "#count"   : range(100, 150),
+},
+
 {
    "#url"     : "https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2",
    "#category": ("", "vsco", "image"),