diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b8f58be2..bec1b554 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -133,6 +133,12 @@ Consider all listed sites to potentially be NSFW. Collections, Galleries, User Profiles + + Bilibili + https://www.bilibili.com/ + Articles, User Articles + + Bluesky https://bsky.app/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index f80b629a..90337021 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -30,6 +30,7 @@ modules = [ "batoto", "bbc", "behance", + "bilibili", "blogger", "bluesky", "boosty", diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py new file mode 100644 index 00000000..718bbf37 --- /dev/null +++ b/gallery_dl/extractor/bilibili.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.bilibili.com/""" + +from .common import Extractor, Message +from .. 
from .. import text, util, exception


class BilibiliExtractor(Extractor):
    """Base class for bilibili extractors"""
    category = "bilibili"
    root = "https://www.bilibili.com"

    def _init(self):
        # Every subclass reaches the site through this one API helper
        self.api = BilibiliAPI(self)


class BilibiliUserArticlesExtractor(BilibiliExtractor):
    """Extractor for a bilibili user's articles"""
    subcategory = "user-articles"
    pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article"
    example = "https://space.bilibili.com/12345/article"

    def items(self):
        """Queue each of the user's articles for BilibiliArticleExtractor"""
        for article in self.api.user_articles(self.groups[0]):
            article["_extractor"] = BilibiliArticleExtractor
            url = "{}/opus/{}".format(self.root, article["opus_id"])
            yield Message.Queue, url, article


class BilibiliArticleExtractor(BilibiliExtractor):
    """Extractor for a bilibili article"""
    subcategory = "article"
    # The dot in "bilibili\.com" is escaped so that it only matches a
    # literal "." and not an arbitrary character.
    pattern = (r"(?:https?://)?"
               r"(?:t\.bilibili\.com|(?:www\.)?bilibili\.com/opus)/(\d+)")
    example = "https://www.bilibili.com/opus/12345"
    directory_fmt = ("{category}", "{username}")
    filename_fmt = "{id}_{num}.{extension}"
    archive_fmt = "{id}_{num}"

    def items(self):
        """Yield one Directory message, then one Url message per picture"""
        article = self.api.article(self.groups[0])

        # Flatten modules list: merge the per-module dicts into one mapping
        modules = {}
        for module in article["detail"]["modules"]:
            # The type tag is dropped before merging; tolerate its absence
            module.pop("module_type", None)
            modules.update(module)
        article["detail"]["modules"] = modules

        article["username"] = modules["module_author"]["name"]

        pics = []
        for paragraph in modules["module_content"]["paragraphs"]:
            if "pic" not in paragraph:
                continue

            try:
                pics.extend(paragraph["pic"]["pics"])
            except (KeyError, TypeError):
                # "pic" exists but carries no usable "pics" list.
                # Keep the best-effort behavior, but leave a trace.
                self.log.debug("Skipping paragraph without pictures: %s",
                               paragraph)

        article["count"] = len(pics)
        yield Message.Directory, article
        for article["num"], pic in enumerate(pics, 1):
            url = pic["url"]
            # Per-picture fields are merged into the shared metadata dict
            article.update(pic)
            yield Message.Url, url, text.nameext_from_url(url, article)


class BilibiliAPI():
    """Helper for bilibili's web-dynamic API and article pages"""

    def __init__(self, extractor):
        # The extractor supplies request() and the logger used below
        self.extractor = extractor

    def _call(self, endpoint, params):
        """Request an API endpoint and return the decoded JSON response

        Raises exception.StopExtraction when the response's "code"
        field is not 0.
        """
        url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint
        data = self.extractor.request(url, params=params).json()

        if data["code"] != 0:
            self.extractor.log.debug("Server response: %s", data)
            raise exception.StopExtraction("API request failed")

        return data

    def user_articles(self, user_id):
        """Yield all article items of a user, following pagination"""
        endpoint = "/opus/feed/space"
        params = {"host_mid": user_id}

        while True:
            data = self._call(endpoint, params)["data"]

            items = data["items"]
            if not items:
                # Without items there is no new offset to continue from;
                # stop instead of requesting the same page forever.
                return
            yield from items

            if not data["has_more"]:
                return
            # The next page starts after the last item that was returned
            params["offset"] = items[-1]["opus_id"]

    def article(self, article_id):
        """Return the state object embedded in an article's HTML page

        Raises exception.StopExtraction when the embedded
        'window.__INITIAL_STATE__' JSON is missing or cannot be decoded.
        """
        url = "https://www.bilibili.com/opus/" + article_id
        response = self.extractor.request(url)

        # The end marker "};" consumes the closing brace, so it is re-added
        state = text.extr(
            response.text, "window.__INITIAL_STATE__=", "};") + "}"
        try:
            return util.json_loads(state)
        except ValueError:
            self.extractor.log.debug("Unparsable article state: %s", state)
            raise exception.StopExtraction("Unable to extract article data")
from gallery_dl.extractor import bilibili


# Expected results for the bilibili extractors.
# Keys starting with "#" describe the test itself; all other keys are
# metadata values expected for the extracted files.
__tests__ = (
{
    "#url"  : "https://www.bilibili.com/opus/988425412565532689",
    "#class": bilibili.BilibiliArticleExtractor,
    "#urls" : (
        "http://i0.hdslb.com/bfs/new_dyn/311264c4dcf45261f7d7a7fe451b05b9405279279.png",
        "http://i0.hdslb.com/bfs/new_dyn/b60d8bc6996529613d617443a12c0a93405279279.png",
        "http://i0.hdslb.com/bfs/new_dyn/d4494543210d9eee5310e11dc62581e4405279279.png",
        "http://i0.hdslb.com/bfs/new_dyn/45268e63086b2d99811b2e6490130937405279279.png",
    ),

    "count"    : 4,
    "detail"   : dict,
    "extension": "png",
    "filename" : str,
    "height"   : 800,
    "id"       : "988425412565532689",
    "isClient" : False,
    "isPreview": False,
    # Files are numbered 1..count; range() excludes its stop value,
    # so the stop must be count + 1 for the fourth file to be accepted.
    "num"      : range(1, 5),
    "size"     : float,
    "theme"    : str,
    "themeMode": "light",
    "url"      : str,
    "username" : "平平出击",
    "width"    : 800,
},

{
    "#url"    : "https://space.bilibili.com/405279279/article",
    "#class"  : bilibili.BilibiliUserArticlesExtractor,
    "#pattern": bilibili.BilibiliArticleExtractor.pattern,
    "#count"  : range(50, 100),
},

)