From c3c1635ef35df7ef3f8884bd933578e79a2ade8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 16 Jan 2024 22:08:03 +0100 Subject: [PATCH] [wikimedia] update - rewrite using BaseExtractor - support most Wiki* domains - update docs/supportedsites - add tests --- docs/supportedsites.md | 58 ++++++ gallery_dl/extractor/wikimedia.py | 284 ++++++++++++++---------------- scripts/supportedsites.py | 1 + test/results/wikibooks.py | 23 +++ test/results/wikimediacommons.py | 23 +++ test/results/wikinews.py | 23 +++ test/results/wikipedia.py | 53 ++++++ test/results/wikiquote.py | 23 +++ test/results/wikisource.py | 23 +++ test/results/wikispecies.py | 25 +++ test/results/wikiversity.py | 23 +++ test/results/wiktionary.py | 23 +++ 12 files changed, 426 insertions(+), 156 deletions(-) create mode 100644 test/results/wikibooks.py create mode 100644 test/results/wikimediacommons.py create mode 100644 test/results/wikinews.py create mode 100644 test/results/wikipedia.py create mode 100644 test/results/wikiquote.py create mode 100644 test/results/wikisource.py create mode 100644 test/results/wikispecies.py create mode 100644 test/results/wikiversity.py create mode 100644 test/results/wiktionary.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 53c88335..d3d2a8a3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW. + + Wikimedia Instances + + + Wikipedia + https://www.wikipedia.org/ + Articles, Categories + + + + Wiktionary + https://www.wiktionary.org/ + Articles, Categories + + + + Wikiquote + https://www.wikiquote.org/ + Articles, Categories + + + + Wikibooks + https://www.wikibooks.org/ + Articles, Categories + + + + Wikisource + https://www.wikisource.org/ + Articles, Categories + + + + Wikinews + https://www.wikinews.org/ + Articles, Categories + + + + Wikiversity + https://www.wikiversity.org/ + Articles, Categories + + + + Wikispecies + https://species.wikimedia.org/ + Articles, Categories + + + + Wikimedia Commons + https://commons.wikimedia.org/ + Articles, Categories + + + Moebooru and MyImouto diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index a2ddfa2c..1a896515 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -1,172 +1,144 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2022 Ailothaen +# Copyright 2022 Ailothaen +# Copyright 2024 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for Wikimedia and Wikipedia. -(Other Mediawiki instances use the same API,so a similar extractor -could be written) +"""Extractors for Wikimedia and Wikipedia""" -Various reference: -https://www.mediawiki.org/wiki/API:Query -https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category -""" - -from .common import Extractor, Message -import time -import re +from .common import BaseExtractor, Message +from .. import text -class WikimediaArticleExtractor(Extractor): - category = "wikimedia" +class WikimediaExtractor(BaseExtractor): + """Base class for wikimedia extractors""" + basecategory = "wikimedia" + directory_fmt = ("{category}", "{page}") + archive_fmt = "{sha1}" + request_interval = (1.0, 2.0) + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.title = match.group(match.lastindex) + + def items(self): + for info in self._pagination(self.params): + image = info["imageinfo"][0] + + image["metadata"] = { + m["name"]: m["value"] + for m in image["metadata"]} + image["commonmetadata"] = { + m["name"]: m["value"] + for m in image["commonmetadata"]} + + filename = image["canonicaltitle"] + image["filename"], _, image["extension"] = \ + filename.partition(":")[2].rpartition(".") + image["date"] = text.parse_datetime( + image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") + image["page"] = self.title + + yield Message.Directory, image + yield Message.Url, image["url"], image + + def _pagination(self, params): + """ + https://www.mediawiki.org/wiki/API:Query + https://opendata.stackexchange.com/questions/13381 + """ + + url = self.root + "/w/api.php" + params["action"] = "query" + params["format"] = "json" + + while True: + data = self.request(url, params=params).json() + + try: + pages = data["query"]["pages"] + except KeyError: + pass + else: + yield from pages.values() + + try: + continuation = data["continue"] + except KeyError: + break + params.update(continuation) + + +BASE_PATTERN = WikimediaExtractor.update({ + "wikipedia": { + "root": None, + "pattern": r"[a-z]{2,}\.wikipedia\.org", + }, + "wiktionary": { + "root": None, + "pattern": r"[a-z]{2,}\.wiktionary\.org", + }, + "wikiquote": { + "root": None, + "pattern": r"[a-z]{2,}\.wikiquote\.org", + }, + "wikibooks": { + "root": None, + "pattern": r"[a-z]{2,}\.wikibooks\.org", + }, + "wikisource": { + "root": None, + "pattern": r"[a-z]{2,}\.wikisource\.org", + }, + "wikinews": { + "root": None, + "pattern": r"[a-z]{2,}\.wikinews\.org", + }, + "wikiversity": { + "root": None, + "pattern": r"[a-z]{2,}\.wikiversity\.org", + }, + "wikispecies": { + "root": "https://species.wikimedia.org", + "pattern": r"species\.wikimedia\.org", + }, + "wikimediacommons": { + "root": "https://commons.wikimedia.org", + "pattern": r"commons\.wikimedia\.org", + }, +}) + + +class WikimediaArticleExtractor(WikimediaExtractor): + """Extractor for wikimedia articles""" subcategory = "article" - filename_fmt = "{filename}.{extension}" - archive_fmt = "a_{sha1}" - pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" - directory_fmt = ("{category}", "{page}") - test = ( - ("https://en.wikipedia.org/wiki/Athena"), - ("https://zh.wikipedia.org/wiki/太阳"), - ("https://simple.wikipedia.org/wiki/Hydrogen", { - "count": ">= 2" - }) - ) + pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)" + example = "https://en.wikipedia.org/wiki/TITLE" - def __init__(self, match): - Extractor.__init__(self, match) - self.lang, self.page = match.groups() - - def items(self): - continuation = None - gimcontinuation = None - - while True: - if continuation is None: - file_list_request = self.request( - "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa - lang=self.lang, page=self.page - ) - ) - else: - file_list_request = self.request( - "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format( # noqa - lang=self.lang, - page=self.page, - continuation=continuation, - gimcontinuation=gimcontinuation, - ) - ) - file_list = file_list_request.json() - - for file_index in list(file_list["query"]["pages"]): - image = file_list["query"]["pages"][file_index]["imageinfo"][0] - - metadata = image - metadata["filename"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[0] - metadata["extension"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[1] - - yield Message.Directory, {"page": self.page, "lang": self.lang} - yield Message.Url, image["url"], image - else: - # We arrived at the end of the response - # checking if there are more files to retrieve - try: - continuation_info = file_list["continue"] - except KeyError: - # No more continuation info: all files were retrieved - break - else: - # Continuation info is present - # there are still files to retrieve - continuation = continuation_info["continue"] - gimcontinuation = continuation_info["gimcontinue"] - - # giving a rest to Wikipedia API - time.sleep(1) + def _init(self): + self.params = { + "generator": "images", + "titles" : self.title, + "prop" : "imageinfo", + "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", + } -class WikimediaCategoryExtractor(Extractor): - category = "wikimedia" +class WikimediaCategoryExtractor(WikimediaExtractor): subcategory = "category" - filename_fmt = "{filename}.{extension}" - archive_fmt = "c_{sha1}" - pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" - directory_fmt = ("{category}", "{page}") + pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)" + example = "https://commons.wikimedia.org/wiki/Category:NAME" - test = ( - ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"), # noqa - ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", { # noqa - "count": ">= 21" - }) - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page = match.groups()[0] - - def items(self): - continuation = None - gcmcontinuation = None - - while True: - if continuation is None: - file_list_request = self.request( - "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa - page=self.page - ) - ) - else: - file_list_request = self.request( - "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format( # noqa - page=self.page, - continuation=continuation, - gcmcontinuation=gcmcontinuation, - ) - ) - file_list = file_list_request.json() - - for file_index in list(file_list["query"]["pages"]): - image = file_list["query"]["pages"][file_index]["imageinfo"][0] - - metadata = image - metadata["filename"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[0] - metadata["extension"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[1] - - yield Message.Directory, {"page": self.page, "lang": "common"} - yield Message.Url, image["url"], image - else: - # We arrived at the end of the response - # checking if there are more files to retrieve - try: - continuation_info = file_list["continue"] - except KeyError: - # No more continuation info: all files were retrieved - break - else: - # Continuation info is present - # there are still files to retrieve - continuation = continuation_info["continue"] - gcmcontinuation = continuation_info["gcmcontinue"] - - # giving a rest to Wikipedia API - time.sleep(1) - - -class WikimediaUtils: - @staticmethod - def clean_name(name): - name = re.sub(r"^\w+:", "", name) - filename = ".".join(name.split(".")[:-1]) - extension = name.split(".")[-1] - return filename, extension + def _init(self): + self.params = { + "generator": "categorymembers", + "gcmtitle" : self.title, + "gcmtype" : "file", + "prop" : "imageinfo", + "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", + } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index d3107b47..34566465 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -139,6 +139,7 @@ CATEGORY_MAP = { "webmshare" : "webmshare", "webtoons" : "Webtoon", "wikiart" : "WikiArt.org", + "wikimediacommons": "Wikimedia Commons", "xbunkr" : "xBunkr", "xhamster" : "xHamster", "xvideos" : "XVideos", diff --git a/test/results/wikibooks.py b/test/results/wikibooks.py new file mode 100644 index 00000000..882741d5 --- /dev/null +++ b/test/results/wikibooks.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikibooks.org/wiki/Title", + "#category": ("wikimedia", "wikibooks", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikibooks.org/wiki/Category:Title", + "#category": ("wikimedia", "wikibooks", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py new file mode 100644 index 00000000..6cc03e34 --- /dev/null +++ b/test/results/wikimediacommons.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg", + "#category": ("wikimedia", "wikimediacommons", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro", + "#category": ("wikimedia", "wikimediacommons", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikinews.py b/test/results/wikinews.py new file mode 100644 index 00000000..8a2af25e --- /dev/null +++ b/test/results/wikinews.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikinews.org/wiki/Title", + "#category": ("wikimedia", "wikinews", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikinews.org/wiki/Category:Title", + "#category": ("wikimedia", "wikinews", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py new file mode 100644 index 00000000..87499878 --- /dev/null +++ b/test/results/wikipedia.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikipedia.org/wiki/Title", + "#category": ("wikimedia", "wikipedia", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikipedia.org/wiki/Athena", + "#category": ("wikimedia", "wikipedia", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#pattern" : r"https://upload.wikimedia.org/wikipedia/.+", + "#count" : range(50, 100), + + "bitdepth" : int, + "canonicaltitle": str, + "comment" : str, + "commonmetadata": dict, + "date" : "type:datetime", + "descriptionshorturl": str, + "descriptionurl": str, + "extension" : str, + "extmetadata" : dict, + "filename" : str, + "height" : int, + "metadata" : dict, + "mime" : r"re:image/\w+", + "page" : "Athena", + "sha1" : r"re:^[0-9a-f]{40}$", + "size" : int, + "timestamp" : str, + "url" : str, + "user" : str, + "userid" : int, + "width" : int, +}, + +{ + "#url" : "https://en.wikipedia.org/wiki/Category:Physics", + "#category": ("wikimedia", "wikipedia", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py new file mode 100644 index 00000000..5e6fb321 --- /dev/null +++ b/test/results/wikiquote.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikiquote.org/wiki/Title", + "#category": ("wikimedia", "wikiquote", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikiquote.org/wiki/Category:Title", + "#category": ("wikimedia", "wikiquote", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikisource.py b/test/results/wikisource.py new file mode 100644 index 00000000..afdee23e --- /dev/null +++ b/test/results/wikisource.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikisource.org/wiki/Title", + "#category": ("wikimedia", "wikisource", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikisource.org/wiki/Category:Title", + "#category": ("wikimedia", "wikisource", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py new file mode 100644 index 00000000..d455fbac --- /dev/null +++ b/test/results/wikispecies.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://species.wikimedia.org/wiki/Geranospiza", + "#category": ("wikimedia", "wikispecies", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#urls" : "https://upload.wikimedia.org/wikipedia/commons/0/01/Geranospiza_caerulescens.jpg", + "#sha1_content": "3a17c14b15489928e4154f826af1c42afb5a523e", +}, + +{ + "#url" : "https://species.wikimedia.org/wiki/Category:Names", + "#category": ("wikimedia", "wikispecies", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py new file mode 100644 index 00000000..58565f49 --- /dev/null +++ b/test/results/wikiversity.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikiversity.org/wiki/Title", + "#category": ("wikimedia", "wikiversity", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikiversity.org/wiki/Category:Title", + "#category": ("wikimedia", "wikiversity", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py new file mode 100644 index 00000000..c7a016f5 --- /dev/null +++ b/test/results/wiktionary.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wiktionary.org/wiki/Word", + "#category": ("wikimedia", "wiktionary", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wiktionary.org/wiki/Category:Words", + "#category": ("wikimedia", "wiktionary", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +)