gallery-dl/gallery_dl/extractor/imgbb.py

# -*- coding: utf-8 -*-

# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://imgbb.com/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache


class ImgbbExtractor(Extractor):
    """Base class for imgbb extractors

    The inherited items() implementation expects a subclass to set
    'page_url' and 'sort' and to provide 'metadata(page)' as well as
    'images(page)'.
    """
    category = "imgbb"
    directory_fmt = ("{category}", "{user}")
    filename_fmt = "{title} {id}.{extension}"
    archive_fmt = "{id}"
    root = "https://imgbb.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # placeholders; overwritten by subclasses that use items() below
        self.page_url = self.sort = None

    def items(self):
        """Yield one Directory message and a Url message for each image

        Raises exception.NotFoundError when the listing page redirects
        to a URL that starts with 'root'.
        """
        self.login()

        # Redirects are followed by hand so that the redirect target
        # can be inspected before it is requested.
        url = self.page_url
        params = {"sort": self.sort}
        while True:
            response = self.request(url, params=params, allow_redirects=False)
            if response.status_code < 300:
                break
            url = response.headers["location"]
            if url.startswith(self.root):
                raise exception.NotFoundError(self.subcategory)

        page = response.text
        data = self.metadata(page)
        first = True

        for img in self.images(page):
            image = {
                "id"       : img["url_viewer"].rpartition("/")[2],
                "user"     : img["user"]["username"] if "user" in img else "",
                "title"    : text.unescape(img["title"]),
                "url"      : img["image"]["url"],
                "extension": img["image"]["extension"],
                "size"     : text.parse_int(img["image"]["size"]),
                "width"    : text.parse_int(img["width"]),
                "height"   : text.parse_int(img["height"]),
            }
            # page-level metadata overrides per-image values of the same key
            image.update(data)
            if first:
                # the Directory message is only sent once there is
                # at least one image
                first = False
                yield Message.Directory, data
            yield Message.Url, image["url"], image

    def login(self):
        """Log in and update the session cookies if a username is set"""
        username, password = self._get_auth_info()
        if username:
            self.cookies_update(self._login_impl(username, password))

    @cache(maxage=365*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies

        Raises exception.AuthenticationError when the login request
        was not answered with a redirect.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/login"
        page = self.request(url).text
        token = text.extr(page, 'PF.obj.config.auth_token="', '"')

        headers = {"Referer": url}
        data = {
            "auth_token"   : token,
            "login-subject": username,
            "password"     : password,
        }
        response = self.request(url, method="POST", headers=headers, data=data)

        # an empty redirect history is treated as a rejected login
        if not response.history:
            raise exception.AuthenticationError()
        return self.cookies

    def _extract_resource(self, page):
        """Return the 'CHV.obj.resource' object embedded in 'page'

        Returns an empty dict when the object is missing or cannot be
        parsed, so that optional metadata degrades to empty values
        instead of aborting the extraction with a JSON decoding error.
        """
        resource = text.extr(page, "CHV.obj.resource=", "};")
        if not resource:
            return {}
        try:
            # the end marker "};" swallows the closing brace; restore it
            data = util.json_loads(resource + "}")
        except ValueError:
            self.log.debug("Unable to parse 'CHV.obj.resource'")
            return {}
        return data if isinstance(data, dict) else {}

    def _extract_user(self, page):
        """Return the 'user' entry of the page resource or an empty dict"""
        return self._extract_resource(page).get("user") or {}

    def _pagination(self, page, endpoint, params):
        """Yield the decoded 'data-object' entries of all listing pages

        The entries found in 'page' are yielded first. Further pages are
        then requested from 'endpoint' via POST, using 'params' (which
        gets modified in place) as form data.
        """
        data = None
        seek, pos = text.extract(page, 'data-seek="', '"')
        tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
        params["action"] = "list"
        params["list"] = "images"
        params["sort"] = self.sort
        params["seek"] = seek
        params["page"] = 2
        params["auth_token"] = tokn

        while True:
            for img in text.extract_iter(page, "data-object='", "'"):
                yield util.json_loads(text.unquote(img))
            if data:
                # stop when the response has no 'seekEnd' value or
                # when that value did not change since the last request
                if not data["seekEnd"] or params["seek"] == data["seekEnd"]:
                    return
                params["seek"] = data["seekEnd"]
                params["page"] += 1
            elif not seek or 'class="pagination-next"' not in page:
                # initial page without seek value or "next" link:
                # nothing more to request
                return
            data = self.request(endpoint, method="POST", data=params).json()
            page = data["html"]


class ImgbbAlbumExtractor(ImgbbExtractor):
    """Extractor for albums on imgbb.com"""
    subcategory = "album"
    directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
    pattern = r"(?:https?://)?ibb\.co/album/([^/?#]+)/?(?:\?([^#]+))?"
    example = "https://ibb.co/album/ID"

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        # group 1: album ID, group 2: optional query string
        album_id, query = match.group(1, 2)
        self.album_name = None
        self.album_id = album_id
        self.sort = text.parse_query(query).get("sort", "date_desc")
        self.page_url = "https://ibb.co/album/{}".format(album_id)

    def metadata(self, page):
        """Return the album and owner metadata found in 'page'"""
        title = text.extr(page, '"og:title" content="', '"')
        owner = self._extract_user(page)

        info = {
            "album_id"  : self.album_id,
            "album_name": text.unescape(title),
        }
        # output key -> key inside the owner object; missing or empty
        # values are stored as empty strings
        fields = (
            ("user"       , "username"),
            ("user_id"    , "id"),
            ("displayname", "name"),
        )
        for key, source in fields:
            info[key] = owner.get(source) or ""
        return info

    def images(self, page):
        """Return a generator over the image objects of this album"""
        canonical = text.extr(page, '"og:url" content="', '"')
        # last path segment of the 'og:url' value, without query string
        album_id = canonical.rsplit("/", 1)[-1].split("?", 1)[0]

        params = {
            "from"      : "album",
            "albumid"   : album_id,
            "params_hidden[list]"   : "images",
            "params_hidden[from]"   : "album",
            "params_hidden[albumid]": album_id,
        }
        return self._pagination(page, "https://ibb.co/json", params)


class ImgbbUserExtractor(ImgbbExtractor):
    """Extractor for user profiles in imgbb.com"""
    subcategory = "user"
    pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
    example = "https://USER.imgbb.com"

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        # group 1: subdomain (user name), group 2: optional query string
        name, query = match.group(1, 2)
        self.user = name
        self.sort = text.parse_query(query).get("sort", "date_desc")
        self.page_url = "https://" + name + ".imgbb.com/"

    def metadata(self, page):
        """Return the profile owner's metadata found in 'page'"""
        profile = self._extract_user(page)

        # fall back to the name from the URL if the page provides none
        username = profile.get("username")
        if not username:
            username = self.user

        return {
            "user"       : username,
            "user_id"    : profile.get("id") or "",
            "displayname": profile.get("name") or "",
        }

    def images(self, page):
        """Return a generator over the image objects of this profile"""
        user_id = text.extr(page, '.obj.resource={"id":"', '"')
        endpoint = self.page_url + "json"
        params = {
            "from"      : "user",
            "userid"    : user_id,
            "params_hidden[userid]": user_id,
            "params_hidden[from]"  : "user",
        }
        return self._pagination(page, endpoint, params)


class ImgbbImageExtractor(ImgbbExtractor):
    """Extractor for individual images on ibb.co"""
    subcategory = "image"
    pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?#]+)"
    example = "https://ibb.co/ID"

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        self.image_id = match.group(1)

    def items(self):
        """Yield a Directory and a Url message for the single image"""
        page = self.request("https://ibb.co/" + self.image_id).text
        extr = text.extract_from(page)
        owner = self._extract_user(page)

        # The lookups are issued in the original order; extract_from()
        # presumably keeps track of its position inside the page.
        title = extr('"og:title" content="', ' hosted at ImgBB"')
        url = extr('"og:image" content="', '"')
        width = extr('"og:image:width" content="', '"')
        height = extr('"og:image:height" content="', '"')

        image = {
            "id"         : self.image_id,
            "title"      : text.unescape(title),
            "url"        : url,
            "width"      : text.parse_int(width),
            "height"     : text.parse_int(height),
            "user"       : owner.get("username") or "",
            "user_id"    : owner.get("id") or "",
            "displayname": owner.get("name") or "",
            "extension"  : text.ext_from_url(url),
        }

        yield Message.Directory, image
        yield Message.Url, url, image
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`# -- coding: utf-8 --`

remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`# Copyright 2019-2023 Mike Fährmann`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://imgbb.com/"""`

			`from .common import Extractor, Message`
replace json.loads with direct calls to JSONDecoder.decode 2023-02-07 23:14:53 +01:00			`from .. import text, util, exception`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`from ..cache import cache`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00

[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`class ImgbbExtractor(Extractor):`
			`"""Base class for imgbb extractors"""`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`category = "imgbb"`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00			`directory_fmt = ("{category}", "{user}")`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`filename_fmt = "{title} {id}.{extension}"`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`archive_fmt = "{id}"`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`root = "https://imgbb.com"`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`self.page_url = self.sort = None`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
			`def items(self):`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`self.login()`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00
			`url = self.page_url`
			`params = {"sort": self.sort}`
			`while True:`
			`response = self.request(url, params=params, allow_redirects=False)`
			`if response.status_code < 300:`
			`break`
			`url = response.headers["location"]`
			`if url.startswith(self.root):`
			`raise exception.NotFoundError(self.subcategory)`

[imgbb] detect invalid album and user profile links and update test results, since the old album got deleted 2019-09-14 22:51:24 +02:00			`page = response.text`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`data = self.metadata(page)`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`first = True`

[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`for img in self.images(page):`
			`image = {`
			`"id" : img["url_viewer"].rpartition("/")[2],`
[imgbb] fix error in galleries without user info (closes #471) 2019-11-10 17:10:51 +01:00			`"user" : img["user"]["username"] if "user" in img else "",`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`"title" : text.unescape(img["title"]),`
			`"url" : img["image"]["url"],`
			`"extension": img["image"]["extension"],`
			`"size" : text.parse_int(img["image"]["size"]),`
			`"width" : text.parse_int(img["width"]),`
			`"height" : text.parse_int(img["height"]),`
			`}`
			`image.update(data)`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`if first:`
			`first = False`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`yield Message.Directory, data`
			`yield Message.Url, image["url"], image`

			`def login(self):`
			`username, password = self._get_auth_info()`
			`if username:`
consistent cookie-related names - rename every cookie variable or method to 'cookies_*' - simplify '.session.cookies' to just '.cookies' - more consistent 'login()' structure 2023-07-21 22:38:39 +02:00			`self.cookies_update(self._login_impl(username, password))`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
replace '24*3600' with '86400' and generalize cache maxage values 2023-12-18 23:19:44 +01:00			`@cache(maxage=365*86400, keyarg=1)`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`def _login_impl(self, username, password):`
			`self.log.info("Logging in as %s", username)`

			`url = self.root + "/login"`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`page = self.request(url).text`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`token = text.extr(page, 'PF.obj.config.auth_token="', '"')`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`headers = {"Referer": url}`
			`data = {`
			`"auth_token" : token,`
			`"login-subject": username,`
			`"password" : password,`
			`}`
			`response = self.request(url, method="POST", headers=headers, data=data)`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`if not response.history:`
			`raise exception.AuthenticationError()`
consistent cookie-related names - rename every cookie variable or method to 'cookies_*' - simplify '.session.cookies' to just '.cookies' - more consistent 'login()' structure 2023-07-21 22:38:39 +02:00			`return self.cookies`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`def _extract_resource(self, page):`
			`return util.json_loads(text.extr(`
			`page, "CHV.obj.resource=", "};") + "}")`

			`def _extract_user(self, page):`
			`return self._extract_resource(page).get("user") or {}`

[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`def _pagination(self, page, endpoint, params):`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`data = None`
[imgbb] improve pagination logic - avoid unnecessary API calls for small or empty galleries - combine duplicate code 2019-11-10 17:07:27 +01:00			`seek, pos = text.extract(page, 'data-seek="', '"')`
			`tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)`
			`params["action"] = "list"`
			`params["list"] = "images"`
			`params["sort"] = self.sort`
			`params["seek"] = seek`
			`params["page"] = 2`
			`params["auth_token"] = tokn`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00
			`while True:`
			`for img in text.extract_iter(page, "data-object='", "'"):`
replace json.loads with direct calls to JSONDecoder.decode 2023-02-07 23:14:53 +01:00			`yield util.json_loads(text.unquote(img))`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`if data:`
[imgbb] update pagination end condition (#4626) 2023-10-09 15:33:25 +02:00			`if not data["seekEnd"] or params["seek"] == data["seekEnd"]:`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`return`
			`params["seek"] = data["seekEnd"]`
			`params["page"] += 1`
[imgbb] improve pagination logic - avoid unnecessary API calls for small or empty galleries - combine duplicate code 2019-11-10 17:07:27 +01:00			`elif not seek or 'class="pagination-next"' not in page:`
			`return`
make 'method' argument of Extractor.request keyword-only 2019-11-05 17:28:09 +01:00			`data = self.request(endpoint, method="POST", data=params).json()`
[imgbb] add album extractor (#361) 2019-07-30 23:02:21 +02:00			`page = data["html"]`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00

			`class ImgbbAlbumExtractor(ImgbbExtractor):`
			`"""Extractor for albums on imgbb.com"""`
			`subcategory = "album"`
			`directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = r"(?:https?://)?ibb\.co/album/([^/?#]+)/?(?:\?([^#]+))?"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://ibb.co/album/ID"`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00
			`def __init__(self, match):`
			`ImgbbExtractor.__init__(self, match)`
			`self.album_name = None`
			`self.album_id = match.group(1)`
			`self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")`
			`self.page_url = "https://ibb.co/album/" + self.album_id`

			`def metadata(self, page):`
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`album = text.extr(page, '"og:title" content="', '"')`
			`user = self._extract_user(page)`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`return {`
[imgbb] Fix `user` extraction, add `displayname` 2023-10-05 12:18:58 +02:00			`"album_id" : self.album_id,`
			`"album_name" : text.unescape(album),`
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`"user" : user.get("username") or "",`
			`"user_id" : user.get("id") or "",`
			`"displayname": user.get("name") or "",`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`}`

			`def images(self, page):`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`url = text.extr(page, '"og:url" content="', '"')`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00			`album_id = url.rpartition("/")[2].partition("?")[0]`

[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`return self._pagination(page, "https://ibb.co/json", {`
			`"from" : "album",`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00			`"albumid" : album_id,`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`"params_hidden[list]" : "images",`
			`"params_hidden[from]" : "album",`
[imgbb] improve redirect handling 2020-04-20 23:36:57 +02:00			`"params_hidden[albumid]": album_id,`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`})`


			`class ImgbbUserExtractor(ImgbbExtractor):`
			`"""Extractor for user profiles in imgbb.com"""`
			`subcategory = "user"`
generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 2021-12-29 22:39:29 +01:00			`pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://USER.imgbb.com"`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00
			`def __init__(self, match):`
			`ImgbbExtractor.__init__(self, match)`
			`self.user = match.group(1)`
			`self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")`
			`self.page_url = "https://{}.imgbb.com/".format(self.user)`

			`def metadata(self, page):`
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`user = self._extract_user(page)`
[imgbb] Fix `user` extraction, add `displayname` 2023-10-05 12:18:58 +02:00			`return {`
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`"user" : user.get("username") or self.user,`
			`"user_id" : user.get("id") or "",`
			`"displayname": user.get("name") or "",`
[imgbb] Fix `user` extraction, add `displayname` 2023-10-05 12:18:58 +02:00			`}`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00
			`def images(self, page):`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`user = text.extr(page, '.obj.resource={"id":"', '"')`
[imgbb] add user extractor + login support (#361) 2019-08-01 21:39:20 +02:00			`return self._pagination(page, self.page_url + "json", {`
			`"from" : "user",`
			`"userid" : user,`
			`"params_hidden[userid]": user,`
			`"params_hidden[from]" : "user",`
			`})`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00

			`class ImgbbImageExtractor(ImgbbExtractor):`
			`subcategory = "image"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?#]+)"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://ibb.co/ID"`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00
			`def __init__(self, match):`
			`ImgbbExtractor.__init__(self, match)`
			`self.image_id = match.group(1)`

			`def items(self):`
			`url = "https://ibb.co/" + self.image_id`
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`page = self.request(url).text`
			`extr = text.extract_from(page)`
			`user = self._extract_user(page)`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00
			`image = {`
			`"id" : self.image_id,`
[imgbb] fix flake8 and username order 2023-10-09 15:29:17 +02:00			`"title" : text.unescape(extr(`
			`'"og:title" content="', ' hosted at ImgBB"')),`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00			`"url" : extr('"og:image" content="', '"'),`
			`"width" : text.parse_int(extr('"og:image:width" content="', '"')),`
			`"height": text.parse_int(extr('"og:image:height" content="', '"')),`
[imgbb] update username extraction (#4626) 2023-10-14 20:55:39 +02:00			`"user" : user.get("username") or "",`
			`"user_id" : user.get("id") or "",`
			`"displayname": user.get("name") or "",`
[imgbb] add extractor for individual images (closes #363) 2019-08-05 22:52:08 +02:00			`}`
			`image["extension"] = text.ext_from_url(image["url"])`

			`yield Message.Directory, image`
			`yield Message.Url, image["url"], image`