gallery-dl/gallery_dl/extractor/imgbb.py

# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://imgbb.com/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import json


class ImgbbExtractor(Extractor):
    """Shared logic for all imgbb extractors

    Subclasses are expected to set 'page_url' and 'sort' and to provide
    'metadata(page)' and 'images(page)'; both are called from items().
    """
    category = "imgbb"
    directory_fmt = ("{category}", "{user}")
    filename_fmt = "{title} {id}.{extension}"
    archive_fmt = "{id}"
    root = "https://imgbb.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # placeholders; the listing extractors overwrite both values
        self.page_url = None
        self.sort = None

    def items(self):
        """Yield version, directory and URL messages for a listing page"""
        self.login()

        # Redirects are followed by hand: a 'Location' header that points
        # back to the main site is reported as NotFoundError.
        query = {"sort": self.sort}
        target = self.page_url
        response = self.request(target, params=query, allow_redirects=False)
        while response.status_code >= 300:
            target = response.headers["location"]
            if target.startswith(self.root):
                raise exception.NotFoundError(self.subcategory)
            response = self.request(
                target, params=query, allow_redirects=False)

        page = response.text
        data = self.metadata(page)
        directory_sent = False

        yield Message.Version, 1
        for img in self.images(page):
            viewer_id = img["url_viewer"].rpartition("/")[2]
            if "user" in img:
                owner = img["user"]["username"]
            else:
                owner = ""
            title = text.unescape(img["title"])
            file_info = img["image"]

            image = {
                "id"       : viewer_id,
                "user"     : owner,
                "title"    : title,
                "url"      : file_info["url"],
                "extension": file_info["extension"],
                "size"     : text.parse_int(file_info["size"]),
                "width"    : text.parse_int(img["width"]),
                "height"   : text.parse_int(img["height"]),
            }
            # values from metadata() take precedence over per-image values
            image.update(data)

            # the directory message is only emitted once an image exists
            if not directory_sent:
                directory_sent = True
                yield Message.Directory, data
            yield Message.Url, image["url"], image

    def login(self):
        """Log in and apply the session cookies if a username is set"""
        username, password = self._get_auth_info()
        if not username:
            return
        cookies = self._login_impl(username, password)
        self._update_cookies(cookies)

    # keyarg=1 selects the 'username' argument; maxage is 360 days in seconds
    @cache(maxage=360*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the session's cookies

        Raises AuthenticationError if the POST response has no redirect
        history.
        """
        self.log.info("Logging in as %s", username)

        login_url = self.root + "/login"
        login_page = self.request(login_url).text
        token, _ = text.extract(
            login_page, 'PF.obj.config.auth_token="', '"')

        form = {
            "auth_token"   : token,
            "login-subject": username,
            "password"     : password,
        }
        response = self.request(
            login_url, method="POST",
            headers={"Referer": login_url}, data=form)

        if response.history:
            return self.session.cookies
        raise exception.AuthenticationError()

    def _pagination(self, page, endpoint, params):
        """Yield decoded image objects from 'page' and all following pages

        'page' is the HTML of the first listing page, 'endpoint' the URL
        that receives the POST requests for further pages, and 'params'
        the request-specific form fields (updated in place).
        """
        data = None
        seek, pos = text.extract(page, 'data-seek="', '"')
        token, _ = text.extract(
            page, 'PF.obj.config.auth_token="', '"', pos)
        params.update((
            ("action"    , "list"),
            ("list"      , "images"),
            ("sort"      , self.sort),
            ("seek"      , seek),
            ("page"      , 2),
            ("auth_token", token),
        ))

        while True:
            for encoded in text.extract_iter(page, "data-object='", "'"):
                yield json.loads(text.unquote(encoded))

            if not data:
                # no JSON response yet: only continue if the HTML page
                # has a seek value and a "next" pagination link
                if not seek or 'class="pagination-next"' not in page:
                    return
            else:
                seek_end = data["seekEnd"]
                # an unchanged seek value means there is nothing new
                if params["seek"] == seek_end:
                    return
                params["seek"] = seek_end
                params["page"] += 1

            data = self.request(endpoint, method="POST", data=params).json()
            page = data["html"]


class ImgbbAlbumExtractor(ImgbbExtractor):
    """Extractor for albums on imgbb.com"""
    subcategory = "album"
    directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
    pattern = r"(?:https?://)?ibb\.co/album/([^/?&#]+)/?(?:\?([^#]+))?"
    test = (
        ("https://ibb.co/album/i5PggF", {
            "range": "1-80",
            "url": "70afec9fcc3a6de62a6b644b487d892d8d47cf1a",
            "keyword": "569e1d88ebdd27655387559cdf1cd526a3e1ab69",
        }),
        ("https://ibb.co/album/i5PggF?sort=title_asc", {
            "range": "1-80",
            "url": "a2dfc58fe3348fa37e242082bd5a85eaa490d0a5",
            "keyword": "5bb79c82411c3770d673fac64a0a98fa28111c3b",
        }),
        # no user data (#471)
        ("https://ibb.co/album/kYKpwF", {
            "url": "ac0abcfcb89f4df6adc2f7e4ff872f3b03ef1bc7",
            "keyword": {"user": ""},
        }),
        # deleted
        ("https://ibb.co/album/fDArrF", {
            "exception": exception.NotFoundError,
        }),
        # private
        ("https://ibb.co/album/hqgWrF", {
            "exception": exception.HttpError,
        })
    )

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        self.album_name = None
        self.album_id = match.group(1)
        # sort order comes from the URL's query string, if present
        self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
        self.page_url = "https://ibb.co/album/" + self.album_id

    def metadata(self, page):
        """Return album ID, album name and owner extracted from 'page'

        Missing values are reported as empty strings instead of raising.
        """
        album, pos = text.extract(page, '"og:title" content="', '"')
        user , pos = text.extract(page, 'rel="author">', '<', pos)
        return {
            "album_id"  : self.album_id,
            # text.extract() yields None when the marker is absent;
            # guard it the same way as 'user' below
            "album_name": text.unescape(album) if album else "",
            "user"      : user.lower() if user else "",
        }

    def images(self, page):
        """Return a generator over all image objects of this album"""
        # Prefer the ID from the page's canonical URL over the one from
        # the input URL, since the request may have been redirected.
        url = text.extract(page, '"og:url" content="', '"')[0]
        album_id = (url or "").rpartition("/")[2].partition("?")[0]
        if not album_id:
            # no usable "og:url" tag: fall back to the ID from the input URL
            album_id = self.album_id

        return self._pagination(page, "https://ibb.co/json", {
            "from"      : "album",
            "albumid"   : album_id,
            "params_hidden[list]"   : "images",
            "params_hidden[from]"   : "album",
            "params_hidden[albumid]": album_id,
        })


class ImgbbUserExtractor(ImgbbExtractor):
    """Extractor for the images of a user profile on imgbb.com"""
    subcategory = "user"
    pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
    test = ("https://folkie.imgbb.com", {
        "range": "1-80",
        "pattern": r"https?://i\.ibb\.co/\w+/[^/?&#]+",
    })

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        # group 1: subdomain (user name), group 2: optional query string
        subdomain = match.group(1)
        query = text.parse_query(match.group(2))

        self.user = subdomain
        self.sort = query.get("sort", "date_desc")
        self.page_url = "https://{}.imgbb.com/".format(subdomain)

    def metadata(self, page):
        """Return the metadata shared by all images of this profile"""
        shared = {"user": self.user}
        return shared

    def images(self, page):
        """Return a generator over all image objects of this profile"""
        user_id, _ = text.extract(page, '.obj.resource={"id":"', '"')
        endpoint = self.page_url + "json"
        form = {
            "from"      : "user",
            "userid"    : user_id,
            "params_hidden[userid]": user_id,
            "params_hidden[from]"  : "user",
        }
        return self._pagination(page, endpoint, form)


class ImgbbImageExtractor(ImgbbExtractor):
    """Extractor for individual images on ibb.co"""
    subcategory = "image"
    pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)"
    test = ("https://ibb.co/fUqh5b", {
        "pattern": r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg",
        "content": "c5a0965178a8b357acd8aa39660092918c63795e",
        "keyword": {
            "id"    : "fUqh5b",
            "title" : "Arundel Ireeman 5",
            "url"   : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg",
            "width" : 960,
            "height": 719,
            "user"  : "folkie",
            "extension": "jpg",
        },
    })

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        self.image_id = match.group(1)

    def items(self):
        """Yield the messages for the single image behind this URL"""
        page = self.request("https://ibb.co/" + self.image_id).text
        extr = text.extract_from(page)

        # The fields are read one after another through the same 'extr'
        # callable, so the order of these calls is kept as is.
        title = text.unescape(extr('"og:title" content="', '"'))
        image_url = extr('"og:image" content="', '"')
        width = text.parse_int(extr('"og:image:width" content="', '"'))
        height = text.parse_int(extr('"og:image:height" content="', '"'))
        user = extr('rel="author">', '<').lower()

        image = {
            "id"    : self.image_id,
            "title" : title,
            "url"   : image_url,
            "width" : width,
            "height": height,
            "user"  : user,
            "extension": text.ext_from_url(image_url),
        }

        yield Message.Version, 1
        yield Message.Directory, image
        yield Message.Url, image_url, image
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`# -*- coding: utf-8 -*-`

			`# Copyright 2019 Mike Fährmann`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://imgbb.com/"""`

			`from .common import Extractor, Message`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`from .. import text, exception`
			`from ..cache import cache`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`import json`


[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`class ImgbbExtractor(Extractor):`
			`"""Base class for imgbb extractors"""`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`category = "imgbb"`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00			`directory_fmt = ("{category}", "{user}")`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`filename_fmt = "{title} {id}.{extension}"`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`archive_fmt = "{id}"`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`root = "https://imgbb.com"`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`self.page_url = self.sort = None`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
			`def items(self):`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`self.login()`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00
			`url = self.page_url`
			`params = {"sort": self.sort}`
			`while True:`
			`response = self.request(url, params=params, allow_redirects=False)`
			`if response.status_code < 300:`
			`break`
			`url = response.headers["location"]`
			`if url.startswith(self.root):`
			`raise exception.NotFoundError(self.subcategory)`

[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`page = response.text`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`data = self.metadata(page)`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`first = True`

[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`yield Message.Version, 1`
			`for img in self.images(page):`
			`image = {`
			`"id" : img["url_viewer"].rpartition("/")[2],`
[imgbb] fix error in galleries without user info (closes #471) 2019-11-10 17:10:51 +01:00			`"user" : img["user"]["username"] if "user" in img else "",`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`"title" : text.unescape(img["title"]),`
			`"url" : img["image"]["url"],`
			`"extension": img["image"]["extension"],`
			`"size" : text.parse_int(img["image"]["size"]),`
			`"width" : text.parse_int(img["width"]),`
			`"height" : text.parse_int(img["height"]),`
			`}`
			`image.update(data)`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`if first:`
			`first = False`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`yield Message.Directory, data`
			`yield Message.Url, image["url"], image`

			`def login(self):`
			`username, password = self._get_auth_info()`
			`if username:`
			`self._update_cookies(self._login_impl(username, password))`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`@cache(maxage=360*24*3600, keyarg=1)`
			`def _login_impl(self, username, password):`
			`self.log.info("Logging in as %s", username)`

			`url = self.root + "/login"`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`page = self.request(url).text`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`headers = {"Referer": url}`
			`data = {`
			`"auth_token" : token,`
			`"login-subject": username,`
			`"password" : password,`
			`}`
			`response = self.request(url, method="POST", headers=headers, data=data)`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`if not response.history:`
			`raise exception.AuthenticationError()`
			`return self.session.cookies`

			`def _pagination(self, page, endpoint, params):`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`data = None`
[imgbb] improve pagination logic - avoid unnecessary API calls for small or empty galleries - combine duplicate code 2019-11-10 17:07:27 +01:00			`seek, pos = text.extract(page, 'data-seek="', '"')`
			`tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)`
			`params["action"] = "list"`
			`params["list"] = "images"`
			`params["sort"] = self.sort`
			`params["seek"] = seek`
			`params["page"] = 2`
			`params["auth_token"] = tokn`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
			`while True:`
			`for img in text.extract_iter(page, "data-object='", "'"):`
			`yield json.loads(text.unquote(img))`
			`if data:`
			`if params["seek"] == data["seekEnd"]:`
			`return`
			`params["seek"] = data["seekEnd"]`
			`params["page"] += 1`
[imgbb] improve pagination logic - avoid unnecessary API calls for small or empty galleries - combine duplicate code 2019-11-10 17:07:27 +01:00			`elif not seek or 'class="pagination-next"' not in page:`
			`return`
make 'method' argument of Extractor.request keyword-only 2019-11-05 17:28:09 +01:00			`data = self.request(endpoint, method="POST", data=params).json()`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`page = data["html"]`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00

			`class ImgbbAlbumExtractor(ImgbbExtractor):`
			`"""Extractor for albums on imgbb.com"""`
			`subcategory = "album"`
			`directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")`
			`pattern = r"(?:https?://)?ibb\.co/album/([^/?&#]+)/?(?:\?([^#]+))?"`
			`test = (`
[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`("https://ibb.co/album/i5PggF", {`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`"range": "1-80",`
[imgbb] update test results Image server domain changed from https://image.ibb.co/ to https://i.ibb.co/ 2020-03-01 20:38:25 +01:00			`"url": "70afec9fcc3a6de62a6b644b487d892d8d47cf1a",`
			`"keyword": "569e1d88ebdd27655387559cdf1cd526a3e1ab69",`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`}),`
[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`("https://ibb.co/album/i5PggF?sort=title_asc", {`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`"range": "1-80",`
[imgbb] update test results Image server domain changed from https://image.ibb.co/ to https://i.ibb.co/ 2020-03-01 20:38:25 +01:00			`"url": "a2dfc58fe3348fa37e242082bd5a85eaa490d0a5",`
			`"keyword": "5bb79c82411c3770d673fac64a0a98fa28111c3b",`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`}),`
[imgbb] fix error in galleries without user info (closes #471) 2019-11-10 17:10:51 +01:00			`# no user data (#471)`
			`("https://ibb.co/album/kYKpwF", {`
			`"url": "ac0abcfcb89f4df6adc2f7e4ff872f3b03ef1bc7",`
			`"keyword": {"user": ""},`
			`}),`
[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`# deleted`
			`("https://ibb.co/album/fDArrF", {`
			`"exception": exception.NotFoundError,`
			`}),`
			`# private`
			`("https://ibb.co/album/hqgWrF", {`
			`"exception": exception.HttpError,`
			`})`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`)`

			`def __init__(self, match):`
			`ImgbbExtractor.__init__(self, match)`
			`self.album_name = None`
			`self.album_id = match.group(1)`
			`self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")`
			`self.page_url = "https://ibb.co/album/" + self.album_id`

			`def metadata(self, page):`
			`album, pos = text.extract(page, '"og:title" content="', '"')`
			`user , pos = text.extract(page, 'rel="author">', '<', pos)`
			`return {`
			`"album_id" : self.album_id,`
			`"album_name": text.unescape(album),`
[imgbb] fix error in galleries without user info (closes #471) 2019-11-10 17:10:51 +01:00			`"user" : user.lower() if user else "",`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`}`

			`def images(self, page):`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00			`url = text.extract(page, '"og:url" content="', '"')[0]`
			`album_id = url.rpartition("/")[2].partition("?")[0]`

[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`return self._pagination(page, "https://ibb.co/json", {`
			`"from" : "album",`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00			`"albumid" : album_id,`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`"params_hidden[list]" : "images",`
			`"params_hidden[from]" : "album",`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00			`"params_hidden[albumid]": album_id,`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`})`


			`class ImgbbUserExtractor(ImgbbExtractor):`
			`"""Extractor for user profiles in imgbb.com"""`
			`subcategory = "user"`
			`pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"`
			`test = ("https://folkie.imgbb.com", {`
			`"range": "1-80",`
			`"pattern": r"https?://i\.ibb\.co/\w+/[^/?&#]+",`
			`})`

			`def __init__(self, match):`
			`ImgbbExtractor.__init__(self, match)`
			`self.user = match.group(1)`
			`self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")`
			`self.page_url = "https://{}.imgbb.com/".format(self.user)`

			`def metadata(self, page):`
			`return {"user": self.user}`

			`def images(self, page):`
[imgbb] improve pagination logic - avoid unnecessary API calls for small or empty galleries - combine duplicate code 2019-11-10 17:07:27 +01:00			`user = text.extract(page, '.obj.resource={"id":"', '"')[0]`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`return self._pagination(page, self.page_url + "json", {`
			`"from" : "user",`
			`"userid" : user,`
			`"params_hidden[userid]": user,`
			`"params_hidden[from]" : "user",`
			`})`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00

			`class ImgbbImageExtractor(ImgbbExtractor):`
			`subcategory = "image"`
			`pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)"`
[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`test = ("https://ibb.co/fUqh5b", {`
[imgbb] update test results Image server domain changed from https://image.ibb.co/ to https://i.ibb.co/ 2020-03-01 20:38:25 +01:00			`"pattern": r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg",`
[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`"content": "c5a0965178a8b357acd8aa39660092918c63795e",`
			`"keyword": {`
			`"id" : "fUqh5b",`
			`"title" : "Arundel Ireeman 5",`
[imgbb] update test results Image server domain changed from https://image.ibb.co/ to https://i.ibb.co/ 2020-03-01 20:38:25 +01:00			`"url" : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg",`
[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`"width" : 960,`
			`"height": 719,`
			`"user" : "folkie",`
			`"extension": "jpg",`
			`},`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00			`})`

			`def __init__(self, match):`
			`ImgbbExtractor.__init__(self, match)`
			`self.image_id = match.group(1)`

			`def items(self):`
			`url = "https://ibb.co/" + self.image_id`
			`extr = text.extract_from(self.request(url).text)`

			`image = {`
			`"id" : self.image_id,`
			`"title" : text.unescape(extr('"og:title" content="', '"')),`
			`"url" : extr('"og:image" content="', '"'),`
			`"width" : text.parse_int(extr('"og:image:width" content="', '"')),`
			`"height": text.parse_int(extr('"og:image:height" content="', '"')),`
			`"user" : extr('rel="author">', '<').lower(),`
			`}`
			`image["extension"] = text.ext_from_url(image["url"])`

			`yield Message.Version, 1`
			`yield Message.Directory, image`
			`yield Message.Url, image["url"], image`