gallery-dl/gallery_dl/extractor/photobucket.py

# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://photobucket.com/"""

from .common import Extractor, Message
from .. import text, exception
import base64
import json


class PhotobucketAlbumExtractor(Extractor):
    """Extractor for albums on photobucket.com

    Emits every image of the album ("library") URL it was created for and,
    unless the 'subalbums' option is disabled, queues each direct subalbum
    for extraction by this same class.
    """
    category = "photobucket"
    subcategory = "album"
    directory_fmt = ("{category}", "{username}", "{location}")
    filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
    archive_fmt = "{id}"
    pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"
               r"/user/[^/?&#]+/library(?:/[^?&#]*)?")
    test = (
        ("https://s369.photobucket.com/user/CrpyLrkr/library", {
            "pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/",
            "count": ">= 50"
        }),
        # subalbums of main "directory"
        ("https://s271.photobucket.com/user/lakerfanryan/library/", {
            "options": (("image-filter", "False"),),
            "pattern": pattern,
            "count": 1,
        }),
        # subalbums of subalbum without images
        ("https://s271.photobucket.com/user/lakerfanryan/library/Basketball", {
            "pattern": pattern,
            "count": ">= 9",
        }),
        # private (missing JSON data)
        ("https://s1277.photobucket.com/user/sinisterkat44/library/", {
            "count": 0,
        }),
        ("https://s1110.photobucket.com/user/chndrmhn100/library/"
         "Chandu%20is%20the%20King?sort=3&page=1"),
    )

    def __init__(self, match):
        """Set up the extractor from a successful 'pattern' match

        The first capture group is the (sub)domain the album is served
        from; it becomes the base for the subalbum listing request.
        """
        Extractor.__init__(self, match)
        self.root = "https://" + match.group(1)
        # stays empty until images() has seen the last page of the album
        self.album_path = ""
        self.session.headers["Referer"] = self.url

    def items(self):
        """Yield Version, Directory/Url and (optionally) Queue messages"""
        yield Message.Version, 1

        for img in self.images():
            # both text fields are passed through text.unescape
            for key in ("titleOrFilename", "title"):
                img[key] = text.unescape(img[key])
            img["extension"] = img["ext"]
            yield Message.Directory, img
            yield Message.Url, img["fullsizeUrl"], img

        if not self.config("subalbums", True):
            return

        for sub in self.subalbums():
            sub["_extractor"] = PhotobucketAlbumExtractor
            yield Message.Queue, sub["url"], sub

    def images(self):
        """Yield all image objects of the current album, page by page

        Each page's 'collectionData' JSON is parsed out of the HTML.
        If it is missing, an error is logged and iteration ends.
        On the last page 'currentAlbumPath' is stored in self.album_path.
        """
        album_url = self.url
        query = {"sort": "3", "page": 1}

        while True:
            html = self.request(album_url, params=query).text
            raw = text.extract(html, "collectionData:", ",\n")[0]

            if not raw:
                # include the privacy notice in the log message, if present
                notice = text.extract(
                    html, 'libraryPrivacyBlock">', "</div>")[0]
                detail = (
                    ' ("{}")'.format(text.remove_html(notice))
                    if notice else ""
                )
                self.log.error("Unable to get JSON data%s", detail)
                return

            data = json.loads(raw)
            yield from data["items"]["objects"]

            last_page = data["total"] <= data["offset"] + data["pageSize"]
            if not last_page:
                query["page"] += 1
                continue

            self.album_path = data["currentAlbumPath"]
            return

    def subalbums(self):
        """Return the subalbum objects listed for self.album_path"""
        endpoint = self.root + "/component/Albums-SubalbumList"
        query = {
            "albumPath": self.album_path,
            "fetchSubAlbumsOnly": "true",
            "deferCollapsed": "true",
            "json": "1",
        }

        body = self.request(endpoint, params=query).json()["body"]
        # an empty tuple stands in when the response lists no subalbums
        return body.get("subAlbums", ())


class PhotobucketImageExtractor(Extractor):
    """Extractor for individual images from photobucket.com

    Looks a single image up through the 'galleryd/search.php' endpoint
    and emits it as one Directory/Url message pair.
    """
    category = "photobucket"
    subcategory = "image"
    directory_fmt = ("{category}", "{username}")
    filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
    archive_fmt = "{username}_{id}"
    pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"
               r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
               r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
    test = (
        (("https://s271.photobucket.com/user/lakerfanryan"
          "/media/Untitled-3-1.jpg.html"), {
            "url": "3b647deeaffc184cc48c89945f67574559c9051f",
            "keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735",
        }),
        (("https://s271.photobucket.com/user/lakerfanryan"
          "/media/IsotopeswBros.jpg.html?sort=3&o=2"), {
            "url": "12c1890c09c9cdb8a88fba7eec13f324796a8d7b",
            "keyword": "61200a223df6c06f45ac3d30c88b3f5b048ce9a8",
        }),
    )

    def __init__(self, match):
        """Set up the extractor from a successful 'pattern' match

        Groups 1 and 2 (user, media ID) come from the '/gallery/user/...'
        URL form; group 3 (user) comes from the '/user/.../media/*.html'
        form, which carries no media ID.
        """
        Extractor.__init__(self, match)
        self.media_id = match.group(2)
        self.user = match.group(1) or match.group(3)
        self.session.headers["Referer"] = self.url

    def items(self):
        """Yield Version, Directory and Url messages for the image

        Raises exception.StopExtraction with the API's message when all
        five lookup attempts return a 'message' entry.
        """
        endpoint = "https://photobucket.com/galleryd/search.php"
        query = {"userName": self.user, "searchTerm": "", "ref": ""}

        # identify the image by media ID when the URL had one,
        # otherwise by the page URL itself
        if self.media_id:
            query["mediaId"] = self.media_id
        else:
            query["url"] = self.url

        # retry API call up to 5 times, since it can randomly fail
        for _ in range(5):
            data = self.request(endpoint, method="POST", params=query).json()
            image = data["mediaDocuments"]
            if "message" not in image:
                break  # success
            self.log.debug(image["message"])
        else:
            raise exception.StopExtraction(image["message"])

        # adjust metadata entries to be at least somewhat similar
        # to what the 'album' extractor provides
        documents = image
        if "media" not in documents:
            image["fileUrl"] = image.pop("imageUrl")
        else:
            image = documents["media"][documents["mediaIndex"]]
            image["albumView"] = documents["albumView"]
            image["username"] = image["ownerId"]

        for key in ("title", "description"):
            image.setdefault(key, "")

        # split the last path segment of the file URL at its last dot
        basename = image["fileUrl"].rpartition("/")[2]
        stem, _, suffix = basename.rpartition(".")
        image["ext"] = suffix
        image["extension"] = suffix
        image["titleOrFilename"] = image["title"] or stem
        image["tags"] = image.pop("clarifaiTagList", [])

        # 'id' is base64; only a decoded "mediaId:<value>" yields a pictureId
        kind, _, ident = base64.b64decode(image["id"]).partition(b":")
        if kind == b"mediaId":
            image["pictureId"] = ident.decode()
        else:
            image["pictureId"] = ""

        yield Message.Version, 1
        yield Message.Directory, image
        yield Message.Url, image["fileUrl"], image
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`# -- coding: utf-8 --`

			`# Copyright 2019 Mike Fährmann`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`"""Extract images from https://photobucket.com/"""`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00
			`from .common import Extractor, Message`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`from .. import text, exception`
			`import base64`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`import json`


			`class PhotobucketAlbumExtractor(Extractor):`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`"""Extractor for albums on photobucket.com"""`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`category = "photobucket"`
			`subcategory = "album"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{username}", "{location}")`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`archive_fmt = "{id}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"`
[photobucket] replace test URL The other user deleted all of is images. 2019-11-02 20:17:08 +01:00			`r"/user/[^/?&#]+/library(?:/[^?&#]*)?")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`test = (`
[photobucket] replace test URL The other user deleted all of is images. 2019-11-02 20:17:08 +01:00			`("https://s369.photobucket.com/user/CrpyLrkr/library", {`
			`"pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/",`
			`"count": ">= 50"`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`# subalbums of main "directory"`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`("https://s271.photobucket.com/user/lakerfanryan/library/", {`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`"options": (("image-filter", "False"),),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`"pattern": pattern,`
[photobucket] improve subalbum extraction (#117) The former implementation would produce a complete list of all subalbums for each (sub)album extraction. This would for example result in a level 2 subalbum getting "extracted" twice: once through the root-album (level 0) and once through its parent album on level 1. In the current implementation only the next level of subalbums are returned, which themselves will handle their next level in a recursive fashion. 2019-01-22 21:35:09 +01:00			`"count": 1,`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`# subalbums of subalbum without images`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`("https://s271.photobucket.com/user/lakerfanryan/library/Basketball", {`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`"pattern": pattern,`
[photobucket] improve subalbum extraction (#117) The former implementation would produce a complete list of all subalbums for each (sub)album extraction. This would for example result in a level 2 subalbum getting "extracted" twice: once through the root-album (level 0) and once through its parent album on level 1. In the current implementation only the next level of subalbums are returned, which themselves will handle their next level in a recursive fashion. 2019-01-22 21:35:09 +01:00			`"count": ">= 9",`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`}),`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`# private (missing JSON data)`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`("https://s1277.photobucket.com/user/sinisterkat44/library/", {`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`"count": 0,`
			`}),`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`("https://s1110.photobucket.com/user/chndrmhn100/library/"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`"Chandu%20is%20the%20King?sort=3&page=1"),`
			`)`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`Extractor.__init__(self, match)`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`self.album_path = ""`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`self.root = "https://" + match.group(1)`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`self.session.headers["Referer"] = self.url`

[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`def items(self):`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`yield Message.Version, 1`
			`for image in self.images():`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`image["titleOrFilename"] = text.unescape(image["titleOrFilename"])`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`image["title"] = text.unescape(image["title"])`
			`image["extension"] = image["ext"]`
			`yield Message.Directory, image`
			`yield Message.Url, image["fullsizeUrl"], image`

[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`if self.config("subalbums", True):`
			`for album in self.subalbums():`
provide type information for Queue messages Child extractors are now directly constructed with Extractor.from_url() if the extractor class is known beforehand, instead of using extractor.find() and searching through all possible extractor classes. 2019-02-12 21:26:41 +01:00			`album["_extractor"] = PhotobucketAlbumExtractor`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`yield Message.Queue, album["url"], album`

[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`def images(self):`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`"""Yield all images of the current album"""`
			`url = self.url`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`params = {"sort": "3", "page": 1}`

			`while True:`
			`page = self.request(url, params=params).text`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`json_data = text.extract(page, "collectionData:", ",\n")[0]`
			`if not json_data:`
			`msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0]`
			`msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""`
			`self.log.error("Unable to get JSON data%s", msg)`
			`return`
			`data = json.loads(json_data)`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00
			`yield from data["items"]["objects"]`

			`if data["total"] <= data["offset"] + data["pageSize"]:`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`self.album_path = data["currentAlbumPath"]`
[photobucket] add 'album' extractor (#117) 2019-01-20 16:19:13 +01:00			`return`
			`params["page"] += 1`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00
			`def subalbums(self):`
[photobucket] improve subalbum extraction (#117) The former implementation would produce a complete list of all subalbums for each (sub)album extraction. This would for example result in a level 2 subalbum getting "extracted" twice: once through the root-album (level 0) and once through its parent album on level 1. In the current implementation only the next level of subalbums are returned, which themselves will handle their next level in a recursive fashion. 2019-01-22 21:35:09 +01:00			`"""Return all subalbum objects"""`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00			`url = self.root + "/component/Albums-SubalbumList"`
[photobucket] improve subalbum extraction (#117) The former implementation would produce a complete list of all subalbums for each (sub)album extraction. This would for example result in a level 2 subalbum getting "extracted" twice: once through the root-album (level 0) and once through its parent album on level 1. In the current implementation only the next level of subalbums are returned, which themselves will handle their next level in a recursive fashion. 2019-01-22 21:35:09 +01:00			`params = {`
			`"albumPath": self.album_path,`
			`"fetchSubAlbumsOnly": "true",`
			`"deferCollapsed": "true",`
			`"json": "1",`
			`}`
[photobucket] download subalbums (#117) 2019-01-21 19:55:05 +01:00
			`data = self.request(url, params=params).json()`
[photobucket] don't crash if JSON data is missing 2019-01-27 20:37:22 +01:00			`return data["body"].get("subAlbums", ())`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00

			`class PhotobucketImageExtractor(Extractor):`
			`"""Extractor for individual images from photobucket.com"""`
			`category = "photobucket"`
			`subcategory = "image"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{username}")`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"`
			`archive_fmt = "{username}_{id}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`r"\|/user/([^/?&#]+)/media/[^?&#]+\.html)")`
			`test = (`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`(("https://s271.photobucket.com/user/lakerfanryan"`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`"/media/Untitled-3-1.jpg.html"), {`
update extraction result tests 2019-02-02 15:37:54 +01:00			`"url": "3b647deeaffc184cc48c89945f67574559c9051f",`
			`"keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735",`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`}),`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`(("https://s271.photobucket.com/user/lakerfanryan"`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`"/media/IsotopeswBros.jpg.html?sort=3&o=2"), {`
update extraction result tests 2019-02-02 15:37:54 +01:00			`"url": "12c1890c09c9cdb8a88fba7eec13f324796a8d7b",`
			`"keyword": "61200a223df6c06f45ac3d30c88b3f5b048ce9a8",`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`)`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`Extractor.__init__(self, match)`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`self.user = match.group(1) or match.group(3)`
			`self.media_id = match.group(2)`
			`self.session.headers["Referer"] = self.url`

			`def items(self):`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`url = "https://photobucket.com/galleryd/search.php"`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`params = {"userName": self.user, "searchTerm": "", "ref": ""}`

			`if self.media_id:`
			`params["mediaId"] = self.media_id`
			`else:`
			`params["url"] = self.url`

			`# retry API call up to 5 times, since it can randomly fail`
			`tries = 0`
			`while tries < 5:`
			`data = self.request(url, method="POST", params=params).json()`
			`image = data["mediaDocuments"]`
			`if "message" not in image:`
			`break # success`
			`tries += 1`
embed error messages in StopExtraction exceptions 2019-10-28 16:06:36 +01:00			`self.log.debug(image["message"])`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`else:`
embed error messages in StopExtraction exceptions 2019-10-28 16:06:36 +01:00			`raise exception.StopExtraction(image["message"])`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00
			`# adjust metadata entries to be at least somewhat similar`
[photobucket] use HTTPS 2019-04-03 18:30:45 +02:00			`# to what the 'album' extractor provides`
[photobucket] add 'image' extractor (#117) 2019-01-22 17:24:43 +01:00			`if "media" in image:`
			`image = image["media"][image["mediaIndex"]]`
			`image["albumView"] = data["mediaDocuments"]["albumView"]`
			`image["username"] = image["ownerId"]`
			`else:`
			`image["fileUrl"] = image.pop("imageUrl")`

			`image.setdefault("title", "")`
			`image.setdefault("description", "")`
			`name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".")`
			`image["ext"] = image["extension"] = ext`
			`image["titleOrFilename"] = image["title"] or name`
			`image["tags"] = image.pop("clarifaiTagList", [])`

			`mtype, _, mid = base64.b64decode(image["id"]).partition(b":")`
			`image["pictureId"] = mid.decode() if mtype == b"mediaId" else ""`

			`yield Message.Version, 1`
			`yield Message.Directory, image`
			`yield Message.Url, image["fileUrl"], image`