From 6eef3e3495d1f232d7d9bcfd75078e2d096dce68 Mon Sep 17 00:00:00 2001 From: hdk5 Date: Fri, 8 Nov 2024 23:17:48 +0200 Subject: [PATCH 1/3] [bilibili] initial support (#2824) --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/bilibili.py | 102 +++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 gallery_dl/extractor/bilibili.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index f80b629a..90337021 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -30,6 +30,7 @@ modules = [ "batoto", "bbc", "behance", + "bilibili", "blogger", "bluesky", "boosty", diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py new file mode 100644 index 00000000..0be8284c --- /dev/null +++ b/gallery_dl/extractor/bilibili.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.bilibili.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +class BilibiliExtractor(Extractor): + """Base class for bilibili extractors""" + category = "bilibili" + root = "https://www.bilibili.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item = match.group(match.lastindex) + + def _init(self): + self.api = BilibiliAPI(self) + + +class BilibiliUserArticlesExtractor(BilibiliExtractor): + """Extractor for all articles of an user""" + subcategory = "user" + pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article" + example = "https://space.bilibili.com/12345/article" + + def items(self): + for article in self.api.user_articles(self.item): + article["_extractor"] = BilibiliArticleExtractor + url = "{}/opus/{}".format(self.root, article["opus_id"]) + yield Message.Queue, url, article + + +class BilibiliArticleExtractor(BilibiliExtractor): + """Extractor for images from an article""" + subcategory = "article" + pattern = (r"(?:https?://)?" + r"(?:t\.bilibili\.com|(?:www\.)?bilibili.com/opus)/(\d+)") + example = "https://www.bilibili.com/opus/12345" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{id}_{num}.{extension}" + archive_fmt = "{id}_{num}" + + def items(self): + article = self.api.article(self.item) + article["username"] = article["modules"]["module_author"]["name"] + article["id"] = article["id_str"] + + dynamic_major = article["modules"]["module_dynamic"]["major"] + if dynamic_major["type"] == "MAJOR_TYPE_OPUS": + urls = [pic["url"] for pic in dynamic_major["opus"]["pics"]] + else: + urls = [] + self.log.warning("%s: Unsupported article type '%s'", + article["id"], dynamic_major["type"]) + + yield Message.Directory, article + for article["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, article) + + +class BilibiliAPI(): + def __init__(self, extractor: BilibiliExtractor): + self.extractor = extractor + + def _call(self, endpoint, params): + url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint + response = self.extractor.request(url, params=params).json() + + if response["code"] != 0: + raise exception.StopExtraction("API request failed") + + return response + + def user_articles(self, user_id): + endpoint = "/opus/feed/space" + params = {"host_mid": user_id} + + while True: + data = self._call(endpoint, params) + + for item in data["data"]["items"]: + params["offset"] = item["opus_id"] + yield item + + if not data["data"]["has_more"]: + break + + def article(self, article_id): + endpoint = "/detail" + params = { + "id": article_id, + "features": "itemOpusStyle", + } + return self._call(endpoint, params)["data"]["item"] From fc59e0fb14d23d5bffb0acd6520ee18ce0b114fc Mon Sep 17 00:00:00 2001 From: hdk5 Date: Sun, 10 Nov 2024 15:18:03 +0200 Subject: [PATCH 2/3] [bilibili] support large articles --- gallery_dl/extractor/bilibili.py | 35 +++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py index 0be8284c..1bf57c3a 100644 --- a/gallery_dl/extractor/bilibili.py +++ b/gallery_dl/extractor/bilibili.py @@ -9,7 +9,7 @@ """Extractors for https://www.bilibili.com/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception class BilibiliExtractor(Extractor): @@ -50,16 +50,21 @@ class BilibiliArticleExtractor(BilibiliExtractor): def items(self): article = self.api.article(self.item) - article["username"] = article["modules"]["module_author"]["name"] - article["id"] = article["id_str"] - dynamic_major = article["modules"]["module_dynamic"]["major"] - if dynamic_major["type"] == "MAJOR_TYPE_OPUS": - urls = [pic["url"] for pic in dynamic_major["opus"]["pics"]] - else: - urls = [] - self.log.warning("%s: Unsupported article type '%s'", - article["id"], dynamic_major["type"]) + # Flatten modules list + modules = {} + for module in article["detail"]["modules"]: + del module['module_type'] + modules.update(module) + article["detail"]["modules"] = modules + + article["username"] = modules["module_author"]["name"] + + urls = [] + for paragraph in modules['module_content']['paragraphs']: + pics = paragraph.get('pic', {}).get('pics', []) + for pic in pics: + urls.append(pic['url']) yield Message.Directory, article for article["num"], url in enumerate(urls, 1): @@ -94,9 +99,7 @@ class BilibiliAPI(): break def article(self, article_id): - endpoint = "/detail" - params = { - "id": article_id, - "features": "itemOpusStyle", - } - return self._call(endpoint, params)["data"]["item"] + url = "https://www.bilibili.com/opus/{}".format(article_id) + response = self.extractor.request(url) + return util.json_loads(text.extr( + response.text, "window.__INITIAL_STATE__=", ";")) From 82d561e825f0668c3a054e8b3d5bf5a8315e5204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 10 Nov 2024 17:44:10 +0100 Subject: [PATCH 3/3] [bilibili] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use self.groups[…] to access matched values - extract more metadata (count, width, height, size) - remove type hint - add tests - update docs/supportedsites --- docs/supportedsites.md | 6 +++++ gallery_dl/extractor/bilibili.py | 46 +++++++++++++++++--------------- scripts/supportedsites.py | 3 +++ test/results/bilibili.py | 45 +++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 22 deletions(-) create mode 100644 test/results/bilibili.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b8f58be2..bec1b554 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -133,6 +133,12 @@ Consider all listed sites to potentially be NSFW. Collections, Galleries, User Profiles + + Bilibili + https://www.bilibili.com/ + Articles, User Articles + + Bluesky https://bsky.app/ diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py index 1bf57c3a..718bbf37 100644 --- a/gallery_dl/extractor/bilibili.py +++ b/gallery_dl/extractor/bilibili.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann -# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. @@ -17,29 +15,25 @@ class BilibiliExtractor(Extractor): category = "bilibili" root = "https://www.bilibili.com" - def __init__(self, match): - Extractor.__init__(self, match) - self.item = match.group(match.lastindex) - def _init(self): self.api = BilibiliAPI(self) class BilibiliUserArticlesExtractor(BilibiliExtractor): - """Extractor for all articles of an user""" - subcategory = "user" + """Extractor for a bilibili user's articles""" + subcategory = "user-articles" pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article" example = "https://space.bilibili.com/12345/article" def items(self): - for article in self.api.user_articles(self.item): + for article in self.api.user_articles(self.groups[0]): article["_extractor"] = BilibiliArticleExtractor url = "{}/opus/{}".format(self.root, article["opus_id"]) yield Message.Queue, url, article class BilibiliArticleExtractor(BilibiliExtractor): - """Extractor for images from an article""" + """Extractor for a bilibili article""" subcategory = "article" pattern = (r"(?:https?://)?" r"(?:t\.bilibili\.com|(?:www\.)?bilibili.com/opus)/(\d+)") @@ -49,7 +43,7 @@ class BilibiliArticleExtractor(BilibiliExtractor): archive_fmt = "{id}_{num}" def items(self): - article = self.api.article(self.item) + article = self.api.article(self.groups[0]) # Flatten modules list modules = {} @@ -60,29 +54,37 @@ class BilibiliArticleExtractor(BilibiliExtractor): article["username"] = modules["module_author"]["name"] - urls = [] + pics = [] for paragraph in modules['module_content']['paragraphs']: - pics = paragraph.get('pic', {}).get('pics', []) - for pic in pics: - urls.append(pic['url']) + if "pic" not in paragraph: + continue + try: + pics.extend(paragraph["pic"]["pics"]) + except Exception: + pass + + article["count"] = len(pics) yield Message.Directory, article - for article["num"], url in enumerate(urls, 1): + for article["num"], pic in enumerate(pics, 1): + url = pic["url"] + article.update(pic) yield Message.Url, url, text.nameext_from_url(url, article) class BilibiliAPI(): - def __init__(self, extractor: BilibiliExtractor): + def __init__(self, extractor): self.extractor = extractor def _call(self, endpoint, params): url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint - response = self.extractor.request(url, params=params).json() + data = self.extractor.request(url, params=params).json() - if response["code"] != 0: + if data["code"] != 0: + self.extractor.log.debug("Server response: %s", data) raise exception.StopExtraction("API request failed") - return response + return data def user_articles(self, user_id): endpoint = "/opus/feed/space" @@ -99,7 +101,7 @@ class BilibiliAPI(): break def article(self, article_id): - url = "https://www.bilibili.com/opus/{}".format(article_id) + url = "https://www.bilibili.com/opus/" + article_id response = self.extractor.request(url) return util.json_loads(text.extr( - response.text, "window.__INITIAL_STATE__=", ";")) + response.text, "window.__INITIAL_STATE__=", "};") + "}") diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 406860c4..2d727170 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -195,6 +195,9 @@ SUBCATEGORY_MAP = { "artwork": "Artwork Listings", "collections": "", }, + "bilibili": { + "user-articles": "User Articles", + }, "bluesky": { "posts": "", }, diff --git a/test/results/bilibili.py b/test/results/bilibili.py new file mode 100644 index 00000000..c32095fd --- /dev/null +++ b/test/results/bilibili.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import bilibili + + +__tests__ = ( +{ + "#url" : "https://www.bilibili.com/opus/988425412565532689", + "#class": bilibili.BilibiliArticleExtractor, + "#urls" : ( + "http://i0.hdslb.com/bfs/new_dyn/311264c4dcf45261f7d7a7fe451b05b9405279279.png", + "http://i0.hdslb.com/bfs/new_dyn/b60d8bc6996529613d617443a12c0a93405279279.png", + "http://i0.hdslb.com/bfs/new_dyn/d4494543210d9eee5310e11dc62581e4405279279.png", + "http://i0.hdslb.com/bfs/new_dyn/45268e63086b2d99811b2e6490130937405279279.png", + ), + + "count" : 4, + "detail" : dict, + "extension": "png", + "filename" : str, + "height" : 800, + "id" : "988425412565532689", + "isClient" : False, + "isPreview": False, + "num" : range(1, 4), + "size" : float, + "theme" : str, + "themeMode": "light", + "url" : str, + "username" : "平平出击", + "width" : 800, +}, + +{ + "#url" : "https://space.bilibili.com/405279279/article", + "#class" : bilibili.BilibiliUserArticlesExtractor, + "#pattern": bilibili.BilibiliArticleExtractor.pattern, + "#count" : range(50, 100), +}, + +)