1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-26 12:42:29 +01:00
gallery-dl/gallery_dl/extractor/imgbb.py

217 lines
7.5 KiB
Python
Raw Normal View History

2019-07-30 23:02:21 +02:00
# -*- coding: utf-8 -*-
# Copyright 2019-2023 Mike Fährmann
2019-07-30 23:02:21 +02:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://imgbb.com/"""
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
2019-07-30 23:02:21 +02:00
class ImgbbExtractor(Extractor):
    """Common base for the imgbb extractors

    Subclasses are expected to set 'page_url' and 'sort' and to provide
    'metadata(page)' and 'images(page)'; this class contributes login,
    manual redirect handling, resource parsing, and pagination.
    """
    category = "imgbb"
    directory_fmt = ("{category}", "{user}")
    filename_fmt = "{title} {id}.{extension}"
    archive_fmt = "{id}"
    root = "https://imgbb.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.page_url = self.sort = None

    def items(self):
        """Yield one directory message followed by a URL message per image"""
        self.login()

        # Redirects are followed by hand so that a redirect pointing back
        # at 'root' can be reported as a missing resource.
        target = self.page_url
        query = {"sort": self.sort}
        response = self.request(
            target, params=query, allow_redirects=False)
        while response.status_code >= 300:
            target = response.headers["location"]
            if target.startswith(self.root):
                raise exception.NotFoundError(self.subcategory)
            response = self.request(
                target, params=query, allow_redirects=False)

        page = response.text
        data = self.metadata(page)

        directory_sent = False
        for img in self.images(page):
            image = {
                "id"       : img["url_viewer"].rpartition("/")[2],
                "user"     : img["user"]["username"] if "user" in img else "",
                "title"    : text.unescape(img["title"]),
                "url"      : img["image"]["url"],
                "extension": img["image"]["extension"],
                "size"     : text.parse_int(img["image"]["size"]),
                "width"    : text.parse_int(img["width"]),
                "height"   : text.parse_int(img["height"]),
            }
            # page-level metadata takes precedence over per-image values
            image.update(data)

            # the directory message is only emitted once an image exists
            if not directory_sent:
                directory_sent = True
                yield Message.Directory, data
            yield Message.Url, image["url"], image

    def login(self):
        """Update the session cookies if a username is configured"""
        username, password = self._get_auth_info()
        if not username:
            return
        self.cookies_update(self._login_impl(username, password))

    # NOTE(review): presumably 'keyarg=1' keys the cache on 'username'
    # and 'maxage' is in seconds (365 days) -- confirm against ..cache
    @cache(maxage=365*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies

        Raises AuthenticationError when the POST request was not
        redirected (empty 'response.history').
        """
        self.log.info("Logging in as %s", username)

        login_url = self.root + "/login"
        login_page = self.request(login_url).text
        token = text.extr(login_page, 'PF.obj.config.auth_token="', '"')

        form = {
            "auth_token"   : token,
            "login-subject": username,
            "password"     : password,
        }
        response = self.request(
            login_url, method="POST", data=form,
            headers={"Referer": login_url})

        if response.history:
            return self.cookies
        raise exception.AuthenticationError()

    def _extract_resource(self, page):
        """Parse the object assigned to 'CHV.obj.resource' in 'page'"""
        # the end marker "};" swallows the closing brace, so re-append it
        blob = text.extr(page, "CHV.obj.resource=", "};")
        return util.json_loads(blob + "}")

    def _extract_user(self, page):
        """Return the resource's 'user' entry, or an empty dict"""
        resource = self._extract_resource(page)
        return resource.get("user") or {}

    def _pagination(self, page, endpoint, params):
        """Yield decoded 'data-object' entries from 'page' and all
        follow-up pages requested from 'endpoint'

        'params' is modified in place and sent as POST data.
        """
        seek, pos = text.extract(page, 'data-seek="', '"')
        token, _ = text.extract(
            page, 'PF.obj.config.auth_token="', '"', pos)
        params.update({
            "action"    : "list",
            "list"      : "images",
            "sort"      : self.sort,
            "seek"      : seek,
            "page"      : 2,
            "auth_token": token,
        })

        response_data = None
        while True:
            for encoded in text.extract_iter(page, "data-object='", "'"):
                yield util.json_loads(text.unquote(encoded))

            if not response_data:
                # still looking at the initial HTML page
                if not seek or 'class="pagination-next"' not in page:
                    return
            else:
                # stop when the endpoint reports no, or an unchanged, seek
                seek_end = response_data["seekEnd"]
                if not seek_end or params["seek"] == seek_end:
                    return
                params["seek"] = seek_end
                params["page"] += 1

            response_data = self.request(
                endpoint, method="POST", data=params).json()
            page = response_data["html"]
class ImgbbAlbumExtractor(ImgbbExtractor):
    """Extractor for albums on imgbb.com"""
    subcategory = "album"
    directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
    pattern = r"(?:https?://)?ibb\.co/album/([^/?#]+)/?(?:\?([^#]+))?"
    example = "https://ibb.co/album/ID"

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        self.album_name = None
        self.album_id = match.group(1)
        # 'sort' comes from the URL's query string, if it has one
        query = text.parse_query(match.group(2))
        self.sort = query.get("sort", "date_desc")
        self.page_url = "https://ibb.co/album/" + self.album_id

    def metadata(self, page):
        """Collect album and owner metadata from the album page"""
        title = text.extr(page, '"og:title" content="', '"')
        owner = self._extract_user(page)
        return {
            "album_id"   : self.album_id,
            "album_name" : text.unescape(title),
            "user"       : owner.get("username") or "",
            "user_id"    : owner.get("id") or "",
            "displayname": owner.get("name") or "",
        }

    def images(self, page):
        """Return a generator over all image objects of this album"""
        # take the album ID from the page's own 'og:url' value:
        # last path segment, minus any query string
        canonical = text.extr(page, '"og:url" content="', '"')
        album_id = canonical.rsplit("/", 1)[-1].split("?", 1)[0]

        params = {
            "from"                  : "album",
            "albumid"               : album_id,
            "params_hidden[list]"   : "images",
            "params_hidden[from]"   : "album",
            "params_hidden[albumid]": album_id,
        }
        return self._pagination(page, "https://ibb.co/json", params)
class ImgbbUserExtractor(ImgbbExtractor):
    """Extractor for user profiles in imgbb.com"""
    subcategory = "user"
    pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
    example = "https://USER.imgbb.com"

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        self.user = match.group(1)
        query = text.parse_query(match.group(2))
        self.sort = query.get("sort", "date_desc")
        # the trailing slash matters: images() appends "json" to this URL
        self.page_url = "https://" + self.user + ".imgbb.com/"

    def metadata(self, page):
        """Collect profile metadata, using the URL's name as fallback"""
        profile = self._extract_user(page)
        return {
            "user"       : profile.get("username") or self.user,
            "user_id"    : profile.get("id") or "",
            "displayname": profile.get("name") or "",
        }

    def images(self, page):
        """Return a generator over all image objects of this profile"""
        user_id = text.extr(page, '.obj.resource={"id":"', '"')
        params = {
            "from"                 : "user",
            "userid"               : user_id,
            "params_hidden[userid]": user_id,
            "params_hidden[from]"  : "user",
        }
        return self._pagination(page, self.page_url + "json", params)
class ImgbbImageExtractor(ImgbbExtractor):
    """Extractor for individual images on ibb.co"""
    subcategory = "image"
    pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?#]+)"
    example = "https://ibb.co/ID"

    def __init__(self, match):
        ImgbbExtractor.__init__(self, match)
        self.image_id = match.group(1)

    def items(self):
        """Yield the directory and URL message for a single image"""
        page = self.request("https://ibb.co/" + self.image_id).text
        extr = text.extract_from(page)
        owner = self._extract_user(page)

        # 'extr' is called in a fixed order; keep these calls in sequence
        title = extr('"og:title" content="', ' hosted at ImgBB"')
        image_url = extr('"og:image" content="', '"')
        width = extr('"og:image:width" content="', '"')
        height = extr('"og:image:height" content="', '"')

        image = {
            "id"         : self.image_id,
            "title"      : text.unescape(title),
            "url"        : image_url,
            "width"      : text.parse_int(width),
            "height"     : text.parse_int(height),
            "user"       : owner.get("username") or "",
            "user_id"    : owner.get("id") or "",
            "displayname": owner.get("name") or "",
        }
        image["extension"] = text.ext_from_url(image_url)

        yield Message.Directory, image
        yield Message.Url, image_url, image