diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 53c88335..d3d2a8a3 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW.
     <td></td>
 </tr>
+<tr>
+    <td colspan="4"><strong>Wikimedia Instances</strong></td>
+</tr>
+<tr>
+    <td>Wikipedia</td>
+    <td>https://www.wikipedia.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wiktionary</td>
+    <td>https://www.wiktionary.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikiquote</td>
+    <td>https://www.wikiquote.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikibooks</td>
+    <td>https://www.wikibooks.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikisource</td>
+    <td>https://www.wikisource.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikinews</td>
+    <td>https://www.wikinews.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikiversity</td>
+    <td>https://www.wikiversity.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikispecies</td>
+    <td>https://species.wikimedia.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Wikimedia Commons</td>
+    <td>https://commons.wikimedia.org/</td>
+    <td>Articles, Categories</td>
+    <td></td>
+</tr>
+<tr>
     <td colspan="4"><strong>Moebooru and MyImouto</strong></td>
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index a2ddfa2c..1a896515 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -1,172 +1,144 @@
 # -*- coding: utf-8 -*-
-# Copyright 2022-2022 Ailothaen
+# Copyright 2022 Ailothaen
+# Copyright 2024 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-"""Extractors for Wikimedia and Wikipedia.
-(Other Mediawiki instances use the same API,so a similar extractor
-could be written)
+"""Extractors for Wikimedia and Wikipedia"""
-Various reference:
-https://www.mediawiki.org/wiki/API:Query
-https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category
-"""
-
-from .common import Extractor, Message
-import time
-import re
+from .common import BaseExtractor, Message
+from .. import text
-class WikimediaArticleExtractor(Extractor):
-    category = "wikimedia"
+class WikimediaExtractor(BaseExtractor):
+    """Base class for wikimedia extractors"""
+    basecategory = "wikimedia"
+    directory_fmt = ("{category}", "{page}")
+    archive_fmt = "{sha1}"
+    request_interval = (1.0, 2.0)
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
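+        # the last regexp group captures the article or category title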
+        self.title = match.group(match.lastindex)
+
+    def items(self):
+        for info in self._pagination(self.params):
+            image = info["imageinfo"][0]
+
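+            # flatten the "metadata" and "commonmetadata" lists of
+            # {"name": ..., "value": ...} entries into plain dicts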
+            image["metadata"] = {
+                m["name"]: m["value"]
+                for m in image["metadata"]}
+            image["commonmetadata"] = {
+                m["name"]: m["value"]
+                for m in image["commonmetadata"]}
+
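+            # "canonicaltitle" has the form "File:Name.ext"; strip the
+            # namespace prefix and split the rest into filename + extension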
+            filename = image["canonicaltitle"]
+            image["filename"], _, image["extension"] = \
+                filename.partition(":")[2].rpartition(".")
+            image["date"] = text.parse_datetime(
+                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+            image["page"] = self.title
+
+            yield Message.Directory, image
+            yield Message.Url, image["url"], image
+
+    def _pagination(self, params):
+        """
+        https://www.mediawiki.org/wiki/API:Query
+        https://opendata.stackexchange.com/questions/13381
+        """
+
+        url = self.root + "/w/api.php"
+        params["action"] = "query"
+        params["format"] = "json"
+
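+        # the API delivers results in batches; a response that contains a
+        # "continue" object provides the parameters for the next batch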
+        while True:
+            data = self.request(url, params=params).json()
+
+            try:
+                pages = data["query"]["pages"]
+            except KeyError:
+                pass
+            else:
+                yield from pages.values()
+
+            try:
+                continuation = data["continue"]
+            except KeyError:
+                break
+            params.update(continuation)
+
+
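+# instance definitions for the supported Wikimedia projects;
+# WikimediaExtractor.update() combines their URL patterns into BASE_PATTERN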
+BASE_PATTERN = WikimediaExtractor.update({
+    "wikipedia": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikipedia\.org",
+    },
+    "wiktionary": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wiktionary\.org",
+    },
+    "wikiquote": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikiquote\.org",
+    },
+    "wikibooks": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikibooks\.org",
+    },
+    "wikisource": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikisource\.org",
+    },
+    "wikinews": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikinews\.org",
+    },
+    "wikiversity": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikiversity\.org",
+    },
+    "wikispecies": {
+        "root": "https://species.wikimedia.org",
+        "pattern": r"species\.wikimedia\.org",
+    },
+    "wikimediacommons": {
+        "root": "https://commons.wikimedia.org",
+        "pattern": r"commons\.wikimedia\.org",
+    },
+})
+
+
+class WikimediaArticleExtractor(WikimediaExtractor):
+    """Extractor for wikimedia articles"""
     subcategory = "article"
-    filename_fmt = "{filename}.{extension}"
-    archive_fmt = "a_{sha1}"
-    pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)"
-    directory_fmt = ("{category}", "{page}")
-    test = (
-        ("https://en.wikipedia.org/wiki/Athena"),
-        ("https://zh.wikipedia.org/wiki/太阳"),
-        ("https://simple.wikipedia.org/wiki/Hydrogen", {
-            "count": ">= 2"
-        })
-    )
+    pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)"
+    example = "https://en.wikipedia.org/wiki/TITLE"
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.lang, self.page = match.groups()
-
-    def items(self):
-        continuation = None
-        gimcontinuation = None
-
-        while True:
-            if continuation is None:
-                file_list_request = self.request(
-                    "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format(  # noqa
-                        lang=self.lang, page=self.page
-                    )
-                )
-            else:
-                file_list_request = self.request(
-                    "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format(  # noqa
-                        lang=self.lang,
-                        page=self.page,
-                        continuation=continuation,
-                        gimcontinuation=gimcontinuation,
-                    )
-                )
-            file_list = file_list_request.json()
-
-            for file_index in list(file_list["query"]["pages"]):
-                image = file_list["query"]["pages"][file_index]["imageinfo"][0]
-
-                metadata = image
-                metadata["filename"] = WikimediaUtils.clean_name(
-                    image["canonicaltitle"]
-                )[0]
-                metadata["extension"] = WikimediaUtils.clean_name(
-                    image["canonicaltitle"]
-                )[1]
-
-                yield Message.Directory, {"page": self.page, "lang": self.lang}
-                yield Message.Url, image["url"], image
-            else:
-                # We arrived at the end of the response
-                # checking if there are more files to retrieve
-                try:
-                    continuation_info = file_list["continue"]
-                except KeyError:
-                    # No more continuation info: all files were retrieved
-                    break
-                else:
-                    # Continuation info is present
-                    # there are still files to retrieve
-                    continuation = continuation_info["continue"]
-                    gimcontinuation = continuation_info["gimcontinue"]
-
-            # giving a rest to Wikipedia API
-            time.sleep(1)
+    def _init(self):
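+        # "generator=images" iterates over all files used on the given page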
+        self.params = {
+            "generator": "images",
+            "titles"   : self.title,
+            "prop"     : "imageinfo",
+            "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
+                      "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
+        }
-class WikimediaCategoryExtractor(Extractor):
-    category = "wikimedia"
+class WikimediaCategoryExtractor(WikimediaExtractor):
     subcategory = "category"
-    filename_fmt = "{filename}.{extension}"
-    archive_fmt = "c_{sha1}"
-    pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)"
-    directory_fmt = ("{category}", "{page}")
+    pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)"
+    example = "https://commons.wikimedia.org/wiki/Category:NAME"
-    test = (
-        ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"),  # noqa
-        ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", {  # noqa
-            "count": ">= 21"
-        })
-    )
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.page = match.groups()[0]
-
-    def items(self):
-        continuation = None
-        gcmcontinuation = None
-
-        while True:
-            if continuation is None:
-                file_list_request = self.request(
-                    "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format(  # noqa
-                        page=self.page
-                    )
-                )
-            else:
-                file_list_request = self.request(
-                    "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format(  # noqa
-                        page=self.page,
-                        continuation=continuation,
-                        gcmcontinuation=gcmcontinuation,
-                    )
-                )
-            file_list = file_list_request.json()
-
-            for file_index in list(file_list["query"]["pages"]):
-                image = file_list["query"]["pages"][file_index]["imageinfo"][0]
-
-                metadata = image
-                metadata["filename"] = WikimediaUtils.clean_name(
-                    image["canonicaltitle"]
-                )[0]
-                metadata["extension"] = WikimediaUtils.clean_name(
-                    image["canonicaltitle"]
-                )[1]
-
-                yield Message.Directory, {"page": self.page, "lang": "common"}
-                yield Message.Url, image["url"], image
-            else:
-                # We arrived at the end of the response
-                # checking if there are more files to retrieve
-                try:
-                    continuation_info = file_list["continue"]
-                except KeyError:
-                    # No more continuation info: all files were retrieved
-                    break
-                else:
-                    # Continuation info is present
-                    # there are still files to retrieve
-                    continuation = continuation_info["continue"]
-                    gcmcontinuation = continuation_info["gcmcontinue"]
-
-            # giving a rest to Wikipedia API
-            time.sleep(1)
-
-
-class WikimediaUtils:
-    @staticmethod
-    def clean_name(name):
-        name = re.sub(r"^\w+:", "", name)
-        filename = ".".join(name.split(".")[:-1])
-        extension = name.split(".")[-1]
-        return filename, extension
+    def _init(self):
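+        # "generator=categorymembers" with "gcmtype=file" iterates over all
+        # files that are members of the given category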
+        self.params = {
+            "generator": "categorymembers",
+            "gcmtitle" : self.title,
+            "gcmtype"  : "file",
+            "prop"     : "imageinfo",
+            "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
+                      "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
+        }
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index d3107b47..34566465 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -139,6 +139,7 @@ CATEGORY_MAP = {
     "webmshare"      : "webmshare",
     "webtoons"       : "Webtoon",
     "wikiart"        : "WikiArt.org",
+    "wikimediacommons": "Wikimedia Commons",
     "xbunkr"         : "xBunkr",
     "xhamster"       : "xHamster",
     "xvideos"        : "XVideos",
diff --git a/test/results/wikibooks.py b/test/results/wikibooks.py
new file mode 100644
index 00000000..882741d5
--- /dev/null
+++ b/test/results/wikibooks.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wikibooks.org/wiki/Title",
+    "#category": ("wikimedia", "wikibooks", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wikibooks.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikibooks", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py
new file mode 100644
index 00000000..6cc03e34
--- /dev/null
+++ b/test/results/wikimediacommons.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg",
+    "#category": ("wikimedia", "wikimediacommons", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
+    "#category": ("wikimedia", "wikimediacommons", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikinews.py b/test/results/wikinews.py
new file mode 100644
index 00000000..8a2af25e
--- /dev/null
+++ b/test/results/wikinews.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wikinews.org/wiki/Title",
+    "#category": ("wikimedia", "wikinews", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wikinews.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikinews", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py
new file mode 100644
index 00000000..87499878
--- /dev/null
+++ b/test/results/wikipedia.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wikipedia.org/wiki/Title",
+    "#category": ("wikimedia", "wikipedia", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wikipedia.org/wiki/Athena",
+    "#category": ("wikimedia", "wikipedia", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+    "#pattern" : r"https://upload.wikimedia.org/wikipedia/.+",
+    "#count"   : range(50, 100),
+
+    "bitdepth"      : int,
+    "canonicaltitle": str,
+    "comment"       : str,
+    "commonmetadata": dict,
+    "date"          : "type:datetime",
+    "descriptionshorturl": str,
+    "descriptionurl": str,
+    "extension"     : str,
+    "extmetadata"   : dict,
+    "filename"      : str,
+    "height"        : int,
+    "metadata"      : dict,
+    "mime"          : r"re:image/\w+",
+    "page"          : "Athena",
+    "sha1"          : r"re:^[0-9a-f]{40}$",
+    "size"          : int,
+    "timestamp"     : str,
+    "url"           : str,
+    "user"          : str,
+    "userid"        : int,
+    "width"         : int,
+},
+
+{
+    "#url"     : "https://en.wikipedia.org/wiki/Category:Physics",
+    "#category": ("wikimedia", "wikipedia", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py
new file mode 100644
index 00000000..5e6fb321
--- /dev/null
+++ b/test/results/wikiquote.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wikiquote.org/wiki/Title",
+    "#category": ("wikimedia", "wikiquote", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wikiquote.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikiquote", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikisource.py b/test/results/wikisource.py
new file mode 100644
index 00000000..afdee23e
--- /dev/null
+++ b/test/results/wikisource.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wikisource.org/wiki/Title",
+    "#category": ("wikimedia", "wikisource", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wikisource.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikisource", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py
new file mode 100644
index 00000000..d455fbac
--- /dev/null
+++ b/test/results/wikispecies.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://species.wikimedia.org/wiki/Geranospiza",
+    "#category": ("wikimedia", "wikispecies", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+    "#urls"    : "https://upload.wikimedia.org/wikipedia/commons/0/01/Geranospiza_caerulescens.jpg",
+    "#sha1_content": "3a17c14b15489928e4154f826af1c42afb5a523e",
+},
+
+{
+    "#url"     : "https://species.wikimedia.org/wiki/Category:Names",
+    "#category": ("wikimedia", "wikispecies", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py
new file mode 100644
index 00000000..58565f49
--- /dev/null
+++ b/test/results/wikiversity.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wikiversity.org/wiki/Title",
+    "#category": ("wikimedia", "wikiversity", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wikiversity.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikiversity", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py
new file mode 100644
index 00000000..c7a016f5
--- /dev/null
+++ b/test/results/wiktionary.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.wiktionary.org/wiki/Word",
+    "#category": ("wikimedia", "wiktionary", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{
+    "#url"     : "https://en.wiktionary.org/wiki/Category:Words",
+    "#category": ("wikimedia", "wiktionary", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)