gallery-dl/gallery_dl/extractor/unsplash.py

# -*- coding: utf-8 -*-

# Copyright 2021-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://unsplash.com/"""

from .common import Extractor, Message
from .. import text, util

# Regex prefix shared by every extractor pattern in this module:
# an optional "http://" or "https://" scheme followed by the unsplash.com host.
BASE_PATTERN = r"(?:https?://)?unsplash\.com"


class UnsplashExtractor(Extractor):
    """Shared base for all unsplash extractors

    Subclasses provide photos(), which returns an iterable of photo
    objects, and may override metadata() to attach extra fields.
    """
    category = "unsplash"
    directory_fmt = ("{category}", "{user[username]}")
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}"
    root = "https://unsplash.com"
    # number of the first result page to request; advanced by skip()
    page_start = 1
    # value sent as 'per_page' and used to detect the last page
    per_page = 20

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first capture group of the matching subclass pattern
        self.item = match.group(1)

    def items(self):
        """Yield a Directory and a Url message for each photo"""
        # the 'format' option selects an entry of photo["urls"]
        variant = self.config("format") or "raw"
        extra = self.metadata()
        unwanted = ("current_user_collections", "related_collections")

        for data in self.photos():
            util.delete_items(data, unwanted)
            target = data["urls"][variant]
            text.nameext_from_url(target, data)

            if extra:
                data.update(extra)
            # set after nameext_from_url() so the extension is always 'jpg'
            data["extension"] = "jpg"
            data["date"] = text.parse_datetime(data["created_at"])
            if "tags" in data:
                # reduce tag objects to their 'title' values
                data["tags"] = [tag["title"] for tag in data["tags"]]

            yield Message.Directory, data
            yield Message.Url, target, data

    @staticmethod
    def metadata():
        """Return extra fields to merge into each photo, or None"""
        return None

    def skip(self, num):
        """Skip whole result pages and return the number of skipped photos

        Only complete pages are skipped: 'page_start' moves forward by
        'num // per_page' and any remainder is left unskipped.
        """
        whole_pages, _ = divmod(num, self.per_page)
        self.page_start += whole_pages
        return whole_pages * self.per_page

    def _pagination(self, url, params, results=False):
        """Yield photo objects from successive pages of 'url'

        'params' is updated in place with 'per_page' and 'page'.
        With 'results' set, photos are read from the "results" entry of
        each response instead of from the response itself.
        """
        params["per_page"] = self.per_page
        params["page"] = self.page_start

        while True:
            payload = self.request(url, params=params).json()
            batch = payload["results"] if results else payload
            yield from batch

            # a page with fewer entries than requested is the last one
            if len(batch) < self.per_page:
                break
            params["page"] += 1


class UnsplashImageExtractor(UnsplashExtractor):
    """Extractor for one individual unsplash photo"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"/photos/([^/?#]+)"
    example = "https://unsplash.com/photos/ID"

    def photos(self):
        """Return a 1-tuple holding the photo object for 'self.item'"""
        endpoint = self.root + "/napi/photos/" + self.item
        photo = self.request(endpoint).json()
        return (photo,)


class UnsplashUserExtractor(UnsplashExtractor):
    """Extractor for every photo of an unsplash user"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/@(\w+)/?$"
    example = "https://unsplash.com/@USER"

    def photos(self):
        """Return a paginated iterator over the user's photos"""
        endpoint = "%s/napi/users/%s/photos" % (self.root, self.item)
        return self._pagination(endpoint, {"order_by": "latest"})


class UnsplashFavoriteExtractor(UnsplashExtractor):
    """Extractor for every photo liked by an unsplash user"""
    subcategory = "favorite"
    pattern = BASE_PATTERN + r"/@(\w+)/likes"
    example = "https://unsplash.com/@USER/likes"

    def photos(self):
        """Return a paginated iterator over the user's likes"""
        endpoint = "{root}/napi/users/{user}/likes".format(
            root=self.root, user=self.item)
        return self._pagination(endpoint, {"order_by": "latest"})


class UnsplashCollectionExtractor(UnsplashExtractor):
    """Extractor for the photos of an unsplash collection"""
    subcategory = "collection"
    pattern = BASE_PATTERN + r"/collections/([^/?#]+)(?:/([^/?#]+))?"
    example = "https://unsplash.com/collections/12345/TITLE"

    def __init__(self, match):
        UnsplashExtractor.__init__(self, match)
        # the title path segment is optional; store "" when it is missing
        title = match.group(2)
        self.title = title if title else ""

    def metadata(self):
        """Return the collection ID and title taken from the URL"""
        return {
            "collection_id": self.item,
            "collection_title": self.title,
        }

    def photos(self):
        """Return a paginated iterator over the collection's photos"""
        endpoint = self.root + "/napi/collections/" + self.item + "/photos"
        return self._pagination(endpoint, {"order_by": "latest"})


class UnsplashSearchExtractor(UnsplashExtractor):
    """Extractor for the results of an unsplash photo search"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"
    example = "https://unsplash.com/s/photos/QUERY"

    def __init__(self, match):
        UnsplashExtractor.__init__(self, match)
        # optional URL query string (the part after '?'), or None
        self.query = match.group(2)

    def photos(self):
        """Return a paginated iterator over the search results"""
        # dashes in the URL path stand for spaces in the search terms
        terms = text.unquote(self.item.replace('-', ' '))
        params = {"query": terms}
        if self.query:
            # parameters from the URL query string are added on top
            params.update(text.parse_query(self.query))
        endpoint = self.root + "/napi/search/photos"
        return self._pagination(endpoint, params, True)
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`# -*- coding: utf-8 -*-`

remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`# Copyright 2021-2023 Mike Fährmann`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://unsplash.com/"""`

			`from .common import Extractor, Message`
			`from .. import text, util`

			`BASE_PATTERN = r"(?:https?://)?unsplash\.com"`


			`class UnsplashExtractor(Extractor):`
			`"""Base class for unsplash extractors"""`
			`category = "unsplash"`
			`directory_fmt = ("{category}", "{user[username]}")`
			`filename_fmt = "{id}.{extension}"`
			`archive_fmt = "{id}"`
			`root = "https://unsplash.com"`
[unsplash] implement 'skip()' 2021-01-23 16:33:20 +01:00			`page_start = 1`
			`per_page = 20`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.item = match.group(1)`

			`def items(self):`
[unsplash] add 'format' option (#1197) 2021-01-21 22:41:49 +01:00			`fmt = self.config("format") or "raw"`
[unsplash] add collection_title and …_id metadata fields (#2670) 2022-06-12 18:26:20 +02:00			`metadata = self.metadata()`

[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`for photo in self.photos():`
			`util.delete_items(`
[unsplash] fix typo 2021-01-20 22:51:02 +01:00			`photo, ("current_user_collections", "related_collections"))`
[unsplash] add 'format' option (#1197) 2021-01-21 22:41:49 +01:00			`url = photo["urls"][fmt]`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`text.nameext_from_url(url, photo)`

[unsplash] add collection_title and …_id metadata fields (#2670) 2022-06-12 18:26:20 +02:00			`if metadata:`
			`photo.update(metadata)`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`photo["extension"] = "jpg"`
			`photo["date"] = text.parse_datetime(photo["created_at"])`
			`if "tags" in photo:`
			`photo["tags"] = [t["title"] for t in photo["tags"]]`

			`yield Message.Directory, photo`
			`yield Message.Url, url, photo`

[unsplash] add collection_title and …_id metadata fields (#2670) 2022-06-12 18:26:20 +02:00			`@staticmethod`
			`def metadata():`
			`return None`

[unsplash] implement 'skip()' 2021-01-23 16:33:20 +01:00			`def skip(self, num):`
			`pages = num // self.per_page`
			`self.page_start += pages`
			`return pages * self.per_page`

[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`def _pagination(self, url, params, results=False):`
[unsplash] implement 'skip()' 2021-01-23 16:33:20 +01:00			`params["per_page"] = self.per_page`
			`params["page"] = self.page_start`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00
			`while True:`
			`photos = self.request(url, params=params).json()`
			`if results:`
			`photos = photos["results"]`
			`yield from photos`

[unsplash] implement 'skip()' 2021-01-23 16:33:20 +01:00			`if len(photos) < self.per_page:`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`return`
			`params["page"] += 1`


			`class UnsplashImageExtractor(UnsplashExtractor):`
			`"""Extractor for a single unsplash photo"""`
			`subcategory = "image"`
[unsplash] add 'collection' extractor (#1197) 2021-01-21 22:27:43 +01:00			`pattern = BASE_PATTERN + r"/photos/([^/?#]+)"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://unsplash.com/photos/ID"`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00
			`def photos(self):`
			`url = "{}/napi/photos/{}".format(self.root, self.item)`
			`return (self.request(url).json(),)`


			`class UnsplashUserExtractor(UnsplashExtractor):`
			`"""Extractor for all photos of an unsplash user"""`
			`subcategory = "user"`
			`pattern = BASE_PATTERN + r"/@(\w+)/?$"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://unsplash.com/@USER"`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00
			`def photos(self):`
			`url = "{}/napi/users/{}/photos".format(self.root, self.item)`
			`params = {"order_by": "latest"}`
			`return self._pagination(url, params)`


			`class UnsplashFavoriteExtractor(UnsplashExtractor):`
			`"""Extractor for all likes of an unsplash user"""`
			`subcategory = "favorite"`
			`pattern = BASE_PATTERN + r"/@(\w+)/likes"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://unsplash.com/@USER/likes"`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00
			`def photos(self):`
			`url = "{}/napi/users/{}/likes".format(self.root, self.item)`
			`params = {"order_by": "latest"}`
			`return self._pagination(url, params)`


[unsplash] add 'collection' extractor (#1197) 2021-01-21 22:27:43 +01:00			`class UnsplashCollectionExtractor(UnsplashExtractor):`
			`"""Extractor for an unsplash collection"""`
			`subcategory = "collection"`
[unsplash] add collection_title and …_id metadata fields (#2670) 2022-06-12 18:26:20 +02:00			`pattern = BASE_PATTERN + r"/collections/([^/?#]+)(?:/([^/?#]+))?"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://unsplash.com/collections/12345/TITLE"`
[unsplash] add collection_title and …_id metadata fields (#2670) 2022-06-12 18:26:20 +02:00
			`def __init__(self, match):`
			`UnsplashExtractor.__init__(self, match)`
			`self.title = match.group(2) or ""`

			`def metadata(self):`
			`return {"collection_id": self.item, "collection_title": self.title}`
[unsplash] add 'collection' extractor (#1197) 2021-01-21 22:27:43 +01:00
			`def photos(self):`
			`url = "{}/napi/collections/{}/photos".format(self.root, self.item)`
			`params = {"order_by": "latest"}`
			`return self._pagination(url, params)`


[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`class UnsplashSearchExtractor(UnsplashExtractor):`
			`"""Extractor for unsplash search results"""`
			`subcategory = "search"`
allow '/' and '?' in URL queries 2022-10-02 19:02:05 +02:00			`pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://unsplash.com/s/photos/QUERY"`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00
			`def __init__(self, match):`
			`UnsplashExtractor.__init__(self, match)`
			`self.query = match.group(2)`

			`def photos(self):`
			`url = self.root + "/napi/search/photos"`
[unsplash] replace dash with space in search API queries (#2429) 2022-03-19 16:00:05 +01:00			`params = {"query": text.unquote(self.item.replace('-', ' '))}`
[unsplash] add extractors (#1197) for - single photos (/photos/ID) - user profiles (/@USER) - user likes (/@USER/likes) - search results (/s/photos/SEARCH) 2021-01-19 02:23:39 +01:00			`if self.query:`
			`params.update(text.parse_query(self.query))`
			`return self._pagination(url, params, True)`