gallery-dl/gallery_dl/extractor/photobucket.py

# -*- coding: utf-8 -*-

# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://photobucket.com/"""

from .common import Extractor, Message
from .. import text, exception
import binascii
import json


class PhotobucketAlbumExtractor(Extractor):
    """Extractor for albums on photobucket.com"""
    category = "photobucket"
    subcategory = "album"
    directory_fmt = ("{category}", "{username}", "{location}")
    filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
    archive_fmt = "{id}"
    pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)"
               r"/user/[^/?&#]+/library(?:/[^?&#]*)?")
    example = "https://s123.photobucket.com/user/USER/library"

    def __init__(self, match):
        """Derive the HTTPS root URL from the host part of the matched URL"""
        self.root = "https://" + match.group(1)
        Extractor.__init__(self, match)
        # Set by images() once the last listing page has been parsed.
        # Stays None when the listing could not be read, so that
        # subalbums() can tell "unknown" apart from a real path.
        self.album_path = None

    def _init(self):
        """Send the album URL as Referer with every request"""
        self.session.headers["Referer"] = self.url

    def items(self):
        """Yield Directory/Url messages per image, then Queue subalbums

        Subalbums are only queued when the 'subalbums' option is enabled
        (default: True); each one is handled by this same extractor class.
        """
        for image in self.images():
            image["titleOrFilename"] = text.unescape(image["titleOrFilename"])
            image["title"] = text.unescape(image["title"])
            image["extension"] = image["ext"]
            yield Message.Directory, image
            yield Message.Url, image["fullsizeUrl"], image

        if self.config("subalbums", True):
            for album in self.subalbums():
                album["_extractor"] = PhotobucketAlbumExtractor
                yield Message.Queue, album["url"], album

    def images(self):
        """Yield all images of the current album

        Walks the paginated album page, parsing the JSON object embedded
        after 'collectionData:' on each page. Logs an error and stops if
        that object cannot be found. When the last page is reached, the
        album's 'currentAlbumPath' is stored in 'self.album_path'.
        """
        url = self.url
        params = {"sort": "3", "page": 1}

        while True:
            page = self.request(url, params=params).text
            json_data = text.extract(page, "collectionData:", ",\n")[0]
            if not json_data:
                # include the page's privacy notice, if any, in the error
                msg = text.extr(page, 'libraryPrivacyBlock">', "</div>")
                msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
                self.log.error("Unable to get JSON data%s", msg)
                return
            data = json.loads(json_data)

            yield from data["items"]["objects"]

            # last page: everything up to 'total' has been listed
            if data["total"] <= data["offset"] + data["pageSize"]:
                self.album_path = data["currentAlbumPath"]
                return
            params["page"] += 1

    def subalbums(self):
        """Return all subalbum objects

        Returns an empty tuple when the album path is unknown, i.e. when
        images() has not (successfully) reached the last listing page.
        """
        if self.album_path is None:
            # images() bailed out early; without an album path there is
            # nothing to query, and the failure was already logged there
            self.log.debug("Skipping subalbums: album path is unknown")
            return ()

        url = self.root + "/component/Albums-SubalbumList"
        params = {
            "albumPath": self.album_path,
            "fetchSubAlbumsOnly": "true",
            "deferCollapsed": "true",
            "json": "1",
        }

        data = self.request(url, params=params).json()
        return data["body"].get("subAlbums", ())


class PhotobucketImageExtractor(Extractor):
    """Extractor for a single image page on photobucket.com"""
    category = "photobucket"
    subcategory = "image"
    directory_fmt = ("{category}", "{username}")
    filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
    archive_fmt = "{username}_{id}"
    pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com"
               r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
               r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
    example = "https://s123.photobucket.com/user/USER/media/NAME.EXT.html"

    def __init__(self, match):
        """Store user name and (optional) media ID from the matched URL"""
        Extractor.__init__(self, match)
        # group 2 only participates in the '/gallery/user/...' form
        self.media_id = match.group(2)
        # the user name is group 1 or group 3, depending on the URL form
        self.user = match.group(1) or match.group(3)

    def _init(self):
        """Send the image page URL as Referer with every request"""
        self.session.headers["Referer"] = self.url

    def items(self):
        """Yield one Directory and one Url message for the requested image

        Raises StopExtraction with the API's message if all five
        attempts at querying the search endpoint report a 'message'.
        """
        endpoint = "https://photobucket.com/galleryd/search.php"
        query = {"userName": self.user, "searchTerm": "", "ref": ""}

        # identify the image by media ID when the URL carried one,
        # otherwise by the page URL itself
        if self.media_id:
            query["mediaId"] = self.media_id
        else:
            query["url"] = self.url

        # the API call can randomly fail, so give it up to 5 attempts
        for _ in range(5):
            data = self.request(endpoint, method="POST", params=query).json()
            image = data["mediaDocuments"]
            if "message" not in image:
                break  # got a usable response
            self.log.debug(image["message"])
        else:
            raise exception.StopExtraction(image["message"])

        # bring the metadata closer to what the 'album' extractor emits
        if "media" in image:
            documents = image
            image = documents["media"][documents["mediaIndex"]]
            image["albumView"] = documents["albumView"]
            image["username"] = image["ownerId"]
        else:
            image["fileUrl"] = image.pop("imageUrl")

        image.setdefault("title", "")
        image.setdefault("description", "")

        # split the last path segment of the file URL at its last dot
        file_url = image["fileUrl"]
        basename = file_url.rpartition("/")[2]
        stem, _, suffix = basename.rpartition(".")
        image["ext"] = suffix
        image["extension"] = suffix
        image["titleOrFilename"] = image["title"] or stem
        image["tags"] = image.pop("clarifaiTagList", [])

        # 'id' is base64 of '<type>:<value>'; only a 'mediaId' type
        # contributes a picture ID
        decoded = binascii.a2b_base64(image["id"])
        id_type, _, id_value = decoded.partition(b":")
        if id_type == b"mediaId":
            image["pictureId"] = id_value.decode()
        else:
            image["pictureId"] = ""

        yield Message.Directory, image
        yield Message.Url, file_url, image
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`# -- coding: utf-8 --`

remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`# Copyright 2019-2023 Mike Fährmann`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`"""Extractors for https://photobucket.com/"""`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00
			`from .common import Extractor, Message`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`from .. import text, exception`
replace remaining instances of base64 with binascii 2023-03-02 18:25:47 +01:00			`import binascii`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`import json`


			`class PhotobucketAlbumExtractor(Extractor):`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`"""Extractor for albums on photobucket.com"""`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`category = "photobucket"`
			`subcategory = "album"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{username}", "{location}")`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`archive_fmt = "{id}"`
generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 2021-12-29 22:39:29 +01:00			`pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)"`
			`r"/user/[^/?&#]+/library(?:/[^?&#]*)?")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://s123.photobucket.com/user/USER/library"`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00
			`def __init__(self, match):`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`self.root = "https://" + match.group(1)`
decouple extractor initialization Introduce an 'initialize()' function that does the actual init (session, cookies, config options) and can called separately from the constructor __init__(). This allows, for example, to adjust config access inside a Job before most of it already happened when calling 'extractor.find()'. 2023-07-25 20:09:44 +02:00			`Extractor.__init__(self, match)`

			`def _init(self):`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`self.session.headers["Referer"] = self.url`

[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`def items(self):`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`for image in self.images():`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`image["titleOrFilename"] = text.unescape(image["titleOrFilename"])`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`image["title"] = text.unescape(image["title"])`
			`image["extension"] = image["ext"]`
			`yield Message.Directory, image`
			`yield Message.Url, image["fullsizeUrl"], image`

[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`if self.config("subalbums", True):`
			`for album in self.subalbums():`
provide type information for Queue messages Child extractors are now directly constructed with Extractor.from_url() if the extractor class is known beforehand, instead of using extractor.find() and searching through all possible extractor classes. 2019-02-12 21:26:41 +01:00			`album["_extractor"] = PhotobucketAlbumExtractor`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`yield Message.Queue, album["url"], album`

[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`def images(self):`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`"""Yield all images of the current album"""`
			`url = self.url`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`params = {"sort": "3", "page": 1}`

			`while True:`
			`page = self.request(url, params=params).text`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`json_data = text.extract(page, "collectionData:", ",\n")[0]`
			`if not json_data:`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`msg = text.extr(page, 'libraryPrivacyBlock">', "</div>")`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""`
			`self.log.error("Unable to get JSON data%s", msg)`
			`return`
			`data = json.loads(json_data)`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00
			`yield from data["items"]["objects"]`

			`if data["total"] <= data["offset"] + data["pageSize"]:`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`self.album_path = data["currentAlbumPath"]`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`return`
			`params["page"] += 1`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00
			`def subalbums(self):`
[photobucket] improve subalbum extraction (#117) The former implementation would produce a complete list of all subalbums for each (sub)album extraction. This would for example result in a level 2 subalbum getting "extracted" twice: once through the root-album (level 0) and once through its parent album on level 1. In the current implementation only the next level of subalbums are returned, which themselves will handle their next level in a recursive fashion. 2019-01-22 21:35:09 +01:00			`"""Return all subalbum objects"""`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`url = self.root + "/component/Albums-SubalbumList"`
[photobucket] improve subalbum extraction (#117) The former implementation would produce a complete list of all subalbums for each (sub)album extraction. This would for example result in a level 2 subalbum getting "extracted" twice: once through the root-album (level 0) and once through its parent album on level 1. In the current implementation only the next level of subalbums are returned, which themselves will handle their next level in a recursive fashion. 2019-01-22 21:35:09 +01:00			`params = {`
			`"albumPath": self.album_path,`
			`"fetchSubAlbumsOnly": "true",`
			`"deferCollapsed": "true",`
			`"json": "1",`
			`}`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00
			`data = self.request(url, params=params).json()`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`return data["body"].get("subAlbums", ())`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00

			`class PhotobucketImageExtractor(Extractor):`
			`"""Extractor for individual images from photobucket.com"""`
			`category = "photobucket"`
			`subcategory = "image"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{username}")`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"`
			`archive_fmt = "{username}_{id}"`
generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 2021-12-29 22:39:29 +01:00			`pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com"`
			`r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"`
			`r"\|/user/([^/?&#]+)/media/[^?&#]+\.html)")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://s123.photobucket.com/user/USER/media/NAME.EXT.html"`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`Extractor.__init__(self, match)`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`self.user = match.group(1) or match.group(3)`
			`self.media_id = match.group(2)`
decouple extractor initialization Introduce an 'initialize()' function that does the actual init (session, cookies, config options) and can called separately from the constructor __init__(). This allows, for example, to adjust config access inside a Job before most of it already happened when calling 'extractor.find()'. 2023-07-25 20:09:44 +02:00
			`def _init(self):`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`self.session.headers["Referer"] = self.url`

			`def items(self):`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`url = "https://photobucket.com/galleryd/search.php"`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`params = {"userName": self.user, "searchTerm": "", "ref": ""}`

			`if self.media_id:`
			`params["mediaId"] = self.media_id`
			`else:`
			`params["url"] = self.url`

			`# retry API call up to 5 times, since it can randomly fail`
			`tries = 0`
			`while tries < 5:`
			`data = self.request(url, method="POST", params=params).json()`
			`image = data["mediaDocuments"]`
			`if "message" not in image:`
			`break # success`
			`tries += 1`
embed error messages in StopExtraction exceptions 2019-10-28 16:06:36 +01:00			`self.log.debug(image["message"])`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`else:`
embed error messages in StopExtraction exceptions 2019-10-28 16:06:36 +01:00			`raise exception.StopExtraction(image["message"])`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00
			`# adjust metadata entries to be at least somewhat similar`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`# to what the 'album' extractor provides`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`if "media" in image:`
			`image = image["media"][image["mediaIndex"]]`
			`image["albumView"] = data["mediaDocuments"]["albumView"]`
			`image["username"] = image["ownerId"]`
			`else:`
			`image["fileUrl"] = image.pop("imageUrl")`

			`image.setdefault("title", "")`
			`image.setdefault("description", "")`
			`name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".")`
			`image["ext"] = image["extension"] = ext`
			`image["titleOrFilename"] = image["title"] or name`
			`image["tags"] = image.pop("clarifaiTagList", [])`

replace remaining instances of base64 with binascii 2023-03-02 18:25:47 +01:00			`mtype, _, mid = binascii.a2b_base64(image["id"]).partition(b":")`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`image["pictureId"] = mid.decode() if mtype == b"mediaId" else ""`

			`yield Message.Directory, image`
			`yield Message.Url, image["fileUrl"], image`