gallery-dl/gallery_dl/extractor/piczel.py

# -*- coding: utf-8 -*-

# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://piczel.tv/"""

from .common import Extractor, Message
from .. import text


class PiczelExtractor(Extractor):
    """Shared behaviour for all piczel.tv extractors

    Subclasses provide the posts to process by overriding posts();
    items() converts every post into Directory and Url messages.
    """
    category = "piczel"
    directory_fmt = ("{category}", "{user[username]}")
    filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"
    archive_fmt = "{id}_{num}"
    root = "https://piczel.tv"
    api_root = root

    def items(self):
        """Yield Directory and Url messages for every post"""
        date_format = "%Y-%m-%dT%H:%M:%S.%f%z"

        for post in self.posts():
            # reduce tag objects to their titles, skipping empty ones
            post["tags"] = [
                tag["title"] for tag in post["tags"] if tag["title"]]
            post["date"] = text.parse_datetime(post["created_at"], date_format)

            if not post["multi"]:
                # single-file post: the file is found at post["image"]
                yield Message.Directory, post
                post["num"] = 0
                url = post["image"]["url"]
                yield Message.Url, url, text.nameext_from_url(url, post)
                continue

            # multi-file post: detach the image list before announcing
            # the directory, then merge each image into the post in turn
            images = post.pop("images")
            yield Message.Directory, post
            for index, image in enumerate(images):
                post["num"] = index
                # drop the image's own "id" so that update() below
                # does not replace the post's "id"
                image.pop("id", None)
                post.update(image)
                url = post["image"]["url"]
                yield Message.Url, url, text.nameext_from_url(url, post)

    def posts(self):
        """Return an iterable of the post objects to process"""

    def _pagination(self, url, folder_id=None):
        """Yield posts from 'url', one API response at a time

        Every request after the first sends the "id" of the previous
        response's last post as 'from_id'; the first empty response
        ends the iteration. If 'folder_id' is given, posts whose
        "folder_id" differs from it are skipped.
        """
        params = {"from_id": None, "folder_id": folder_id}

        while True:
            page = self.request(url, params=params).json()
            if not page:
                break
            # remember where the next request has to continue
            params["from_id"] = page[-1]["id"]

            for post in page:
                if folder_id and folder_id != post["folder_id"]:
                    continue
                yield post


class PiczelUserExtractor(PiczelExtractor):
    """Extractor for every image in a user's gallery"""
    subcategory = "user"
    pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?#]+)/?$"
    example = "https://piczel.tv/gallery/USER"

    def __init__(self, match):
        """Remember the user name captured by 'pattern'"""
        super().__init__(match)
        self.user = match.group(1)

    def posts(self):
        """Return a generator over all posts of the user's gallery"""
        endpoint = "{}/api/users/{}/gallery".format(self.api_root, self.user)
        return self._pagination(endpoint)


class PiczelFolderExtractor(PiczelExtractor):
    """Extractor for images inside a user's folder"""
    subcategory = "folder"
    directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")
    archive_fmt = "f{folder[id]}_{id}_{num}"
    # Exclude only the literal path segment "image/" (single-image URLs).
    # A bare "(?!image)" would also reject every user whose name merely
    # starts with "image", e.g. /gallery/imageboy/12.
    pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv"
               r"/gallery/(?!image/)([^/?#]+)/(\d+)")
    example = "https://piczel.tv/gallery/USER/12345"

    def __init__(self, match):
        """Store the user name and folder ID captured by 'pattern'"""
        PiczelExtractor.__init__(self, match)
        self.user, self.folder_id = match.groups()

    def posts(self):
        """Return the user's gallery posts that belong to this folder"""
        url = "{}/api/users/{}/gallery".format(self.api_root, self.user)
        # The folder ID is captured from the URL as text; _pagination()
        # compares it to each post's "folder_id" with ==, so pass an int
        # (presumably the API reports folder IDs as integers - confirm).
        return self._pagination(url, int(self.folder_id))


class PiczelImageExtractor(PiczelExtractor):
    """Extractor for a single image post"""
    subcategory = "image"
    pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"
    example = "https://piczel.tv/gallery/image/12345"

    def __init__(self, match):
        """Remember the image ID captured by 'pattern'"""
        super().__init__(match)
        self.image_id = match.group(1)

    def posts(self):
        """Return a 1-tuple holding the post object for this image ID"""
        endpoint = "{}/api/gallery/{}".format(self.api_root, self.image_id)
        post = self.request(endpoint).json()
        return (post,)
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00			`# -*- coding: utf-8 -*-`

[piczel] update API server (#4244) 2023-06-30 17:19:53 +02:00			`# Copyright 2018-2023 Mike Fährmann`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`"""Extractors for https://piczel.tv/"""`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
			`from .common import Extractor, Message`
			`from .. import text`


			`class PiczelExtractor(Extractor):`
			`"""Base class for piczel extractors"""`
			`category = "piczel"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{user[username]}")`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00			`filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"`
			`archive_fmt = "{id}_{num}"`
			`root = "https://piczel.tv"`
[piczel] update API server (#4244) 2023-06-30 17:19:53 +02:00			`api_root = root`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
			`def items(self):`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`for post in self.posts():`
			`post["tags"] = [t["title"] for t in post["tags"] if t["title"]]`
			`post["date"] = text.parse_datetime(`
			`post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")`

			`if post["multi"]:`
			`images = post["images"]`
			`del post["images"]`
			`yield Message.Directory, post`
			`for post["num"], image in enumerate(images):`
			`if "id" in image:`
			`del image["id"]`
			`post.update(image)`
			`url = post["image"]["url"]`
			`yield Message.Url, url, text.nameext_from_url(url, post)`

[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00			`else:`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`yield Message.Directory, post`
			`post["num"] = 0`
			`url = post["image"]["url"]`
			`yield Message.Url, url, text.nameext_from_url(url, post)`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`def posts(self):`
			`"""Return an iterable with all relevant post objects"""`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`def _pagination(self, url, folder_id=None):`
			`params = {`
			`"from_id" : None,`
			`"folder_id": folder_id,`
			`}`

			`while True:`
			`data = self.request(url, params=params).json()`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`if not data:`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`return`
			`params["from_id"] = data[-1]["id"]`
[piczel] fix extraction - manually filter by folder_id - extract data for single posts from embedded JSON, since the '/api/gallery/image/<id>' endpoint is no longer available 2020-03-17 17:12:28 +01:00
			`for post in data:`
			`if not folder_id or folder_id == post["folder_id"]:`
			`yield post`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
			`class PiczelUserExtractor(PiczelExtractor):`
			`"""Extractor for all images from a user's gallery"""`
			`subcategory = "user"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?#]+)/?$"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://piczel.tv/gallery/USER"`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`def __init__(self, match):`
			`PiczelExtractor.__init__(self, match)`
			`self.user = match.group(1)`

[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`def posts(self):`
[piczel] update API URLs 2020-12-07 15:56:32 +01:00			`url = "{}/api/users/{}/gallery".format(self.api_root, self.user)`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`return self._pagination(url)`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00

			`class PiczelFolderExtractor(PiczelExtractor):`
			`"""Extractor for images inside a user's folder"""`
			`subcategory = "folder"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00			`archive_fmt = "f{folder[id]}_{id}_{num}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`r"/gallery/(?!image)([^/?#]+)/(\d+)")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://piczel.tv/gallery/USER/12345"`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`def __init__(self, match):`
			`PiczelExtractor.__init__(self, match)`
			`self.user, self.folder_id = match.groups()`

[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`def posts(self):`
[piczel] update API URLs 2020-12-07 15:56:32 +01:00			`url = "{}/api/users/{}/gallery".format(self.api_root, self.user)`
[piczel] fix extraction - manually filter by folder_id - extract data for single posts from embedded JSON, since the '/api/gallery/image/<id>' endpoint is no longer available 2020-03-17 17:12:28 +01:00			`return self._pagination(url, int(self.folder_id))`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00

			`class PiczelImageExtractor(PiczelExtractor):`
			`"""Extractor for individual images"""`
			`subcategory = "image"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://piczel.tv/gallery/image/12345"`
[piczel] add user, folder and image extractors 2018-08-08 10:53:01 +02:00
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 2019-08-24 20:37:33 +02:00			`def __init__(self, match):`
			`PiczelExtractor.__init__(self, match)`
			`self.image_id = match.group(1)`

[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 2020-02-27 02:13:33 +01:00			`def posts(self):`
[piczel] update API URLs 2020-12-07 15:56:32 +01:00			`url = "{}/api/gallery/{}".format(self.api_root, self.image_id)`
[piczel] fix extraction for single images 2020-03-31 22:47:23 +02:00			`return (self.request(url).json(),)`