Date: Mon, 15 Jan 2024 18:24:47 +0100
Subject: [PATCH 72/77] [webtoons] small optimization
don't extract the entire 'author_area' and
avoid creating a second 'text.extract_from()' object
---
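Note: text.extract_from(page) returns a stateful callable that scans
forward through its input on each call. A minimal sketch of the pattern
this change relies on (the HTML snippet is illustrative, not the real
page markup):

    from gallery_dl import text

    page = '<a href="/creator/foo">by <span>Foo</span></a>'
    extr = text.extract_from(page)
    # each call continues scanning where the previous one stopped,
    # so a single extractor object can pull consecutive fields
    username    = extr('/creator/', '"')      # -> "foo"
    author_name = extr('<span>', '</span>')   # -> "Foo"

Reusing the one callable avoids slicing out 'author_area' first and
building a second text.extract_from() object for it.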
gallery_dl/extractor/webtoons.py | 9 +++++----
test/results/webtoons.py | 16 ++++++++++++++++
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 1c7af470..a4259358 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -92,10 +92,11 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
         title = extr('<meta property="og:title" content="', '"')
-        author_area = extr('<div class="author_area">', '</div>')
-        aa_extr = text.extract_from(author_area)
-        username = aa_extr('/creator/', '"')
-        author_name = aa_extr('<span>', '</span>')
+        if extr('<div class="author_area"', '>'):
+            username = extr('/creator/', '"')
+            author_name = extr('<span>', '</span>')
+        else:
+            username = author_name = ""
return {
"genre" : self.genre,
diff --git a/test/results/webtoons.py b/test/results/webtoons.py
index 9ca93446..82831f02 100644
--- a/test/results/webtoons.py
+++ b/test/results/webtoons.py
@@ -20,6 +20,22 @@ __tests__ = (
"42055e44659f6ffc410b3fb6557346dfbb993df3",
"49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9",
],
+
+ "author_name" : "Chris McCoy",
+ "comic" : "safely-endangered",
+ "comic_name" : "Safely Endangered",
+ "count" : 5,
+ "description" : "Silly comics for silly people.",
+ "episode" : "572",
+ "episode_name": "Ep. 572 - Earth",
+ "episode_no" : "572",
+ "genre" : "comedy",
+ "lang" : "en",
+ "language" : "English",
+    "num"         : range(1, 6),
+ "title" : "Safely Endangered - Ep. 572 - Earth",
+ "title_no" : "352",
+ "username" : "safelyendangered",
},
{
From 4d6ec6958d29bd22739ba5fe27086e715d51fbc1 Mon Sep 17 00:00:00 2001
From: Mike Fährmann
Date: Mon, 15 Jan 2024 22:37:33 +0100
Subject: [PATCH 73/77] [scripts] add 'push --force' to pull-request
---
scripts/pull-request | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/scripts/pull-request b/scripts/pull-request
index defdc11f..dea9b292 100755
--- a/scripts/pull-request
+++ b/scripts/pull-request
@@ -41,6 +41,10 @@ case "${2,,}" in
call git push "$USER" HEAD:"$BRANCH"
;;
+"pf"|"push-force")
+ call git push --force "$USER" HEAD:"$BRANCH"
+ ;;
+
"d"|"delete")
call git switch master
call git branch -D "$USER-$BRANCH"
From 3d68eda4abcfde18ecf377f140b8ad6ec4c2de6d Mon Sep 17 00:00:00 2001
From: Mike Fährmann
Date: Tue, 16 Jan 2024 00:24:30 +0100
Subject: [PATCH 74/77] [kemonoparty] add 'revision_hash' metadata (#4706,
#4727, #5013)
A SHA1 hexdigest of other relevant metadata fields like
title, content, file and attachment URLs.
This value does NOT reflect which revisions are listed on the website.
Neither does 'edited' nor any other metadata field or combination of fields.
---
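Note: in outline, the hash is a SHA1 hexdigest over a canonical JSON
dump of a revision with its volatile fields removed. A standalone
sketch of the idea (field names follow the diff below; the actual code
uses util.sha1 and a pre-built JSONEncoder instead):

    import hashlib
    import json

    def revision_hash(revision):
        rev = revision.copy()
        # drop fields that differ between otherwise identical revisions
        for key in ("revision_id", "added", "next", "prev"):
            rev.pop(key, None)
        # compare files by URL only, not by (renamable) filename
        rev["file"].pop("name", None)
        for attachment in rev["attachments"]:
            attachment.pop("name", None)
        data = json.dumps(rev, ensure_ascii=False, sort_keys=True,
                          separators=(",", ":"))
        return hashlib.sha1(data.encode()).hexdigest()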
gallery_dl/extractor/kemonoparty.py | 26 ++++++++++++++++++++++----
test/results/kemonoparty.py | 2 ++
2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index c24e57d1..10228b5c 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -9,9 +9,10 @@
"""Extractors for https://kemono.party/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache, memcache
import itertools
+import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor):
Extractor.__init__(self, match)
def _init(self):
+ self.revisions = self.config("revisions")
self._prepare_ddosguard_cookies()
self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
+ self._json_dumps = json.JSONEncoder(
+ ensure_ascii=False, check_circular=False,
+ sort_keys=True, separators=(",", ":")).encode
def items(self):
find_hash = re.compile(HASH_PATTERN).match
@@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor):
idx = len(revs)
for rev in revs:
+ rev["revision_hash"] = self._revision_hash(rev)
rev["revision_index"] = idx
idx -= 1
return revs
+ def _revision_hash(self, revision):
+ rev = revision.copy()
+ rev.pop("revision_id", None)
+ rev.pop("added", None)
+ rev.pop("next", None)
+ rev.pop("prev", None)
+ rev["file"].pop("name", None)
+ for a in rev["attachments"]:
+ a.pop("name", None)
+ return util.sha1(self._json_dumps(rev))
+
def _validate(response):
return (response.headers["content-length"] != "9" or
@@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
url = self.api_url
params = text.parse_query(self.query)
params["o"] = text.parse_int(params.get("o"))
- revisions = self.config("revisions")
while True:
posts = self.request(url, params=params).json()
- if revisions:
+ if self.revisions:
for post in posts:
+ post["revision_hash"] = self._revision_hash(post)
post["revision_id"] = 0
post_url = "{}/post/{}".format(self.api_url, post["id"])
try:
@@ -296,7 +313,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
def posts(self):
if not self.revision:
post = self.request(self.api_url).json()
- if self.config("revisions"):
+ if self.revisions:
+ post["revision_hash"] = self._revision_hash(post)
post["revision_id"] = 0
try:
revs = self._post_revisions(self.api_url)
diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py
index 5bd541a3..c3dbdf73 100644
--- a/test/results/kemonoparty.py
+++ b/test/results/kemonoparty.py
@@ -177,6 +177,7 @@ __tests__ = (
"revision_id": 142470,
"revision_index": 2,
+ "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40",
},
{
@@ -190,6 +191,7 @@ __tests__ = (
"revision_id": range(134996, 3052965),
"revision_index": range(1, 9),
+ "revision_hash": r"re:^[0-9a-f]{40}$",
},
From e33056adcd1469a80f1f7656848d1cf6cde5b3f6 Mon Sep 17 00:00:00 2001
From: Ailothaen
Date: Sun, 27 Feb 2022 19:40:15 +0100
Subject: [PATCH 75/77] [wikimedia] Add Wikipedia/Wikimedia extractor
---
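Note: the extractor pages through results with MediaWiki's 'continue'
mechanism. Roughly, under the same parameters as the diff below
(response handling simplified; pages without 'imageinfo' would need
extra checks):

    import time
    import requests

    url = "https://en.wikipedia.org/w/api.php"
    params = {"action": "query", "format": "json",
              "generator": "images", "titles": "Athena",
              "prop": "imageinfo", "iiprop": "url|sha1|canonicaltitle"}

    while True:
        data = requests.get(url, params=params).json()
        for page in data["query"]["pages"].values():
            print(page["imageinfo"][0]["url"])
        if "continue" not in data:
            break  # no continuation info: all files were retrieved
        # merge 'continue' + 'gimcontinue' into the next request
        params.update(data["continue"])
        time.sleep(1)  # give the API a rest between requests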
gallery_dl/extractor/__init__.py | 1 +
gallery_dl/extractor/wikimedia.py | 172 ++++++++++++++++++++++++++++++
2 files changed, 173 insertions(+)
create mode 100644 gallery_dl/extractor/wikimedia.py
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 8e712961..86308917 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -178,6 +178,7 @@ modules = [
"weibo",
"wikiart",
"wikifeet",
+ "wikimedia",
"xhamster",
"xvideos",
"zerochan",
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
new file mode 100644
index 00000000..41cc1c9e
--- /dev/null
+++ b/gallery_dl/extractor/wikimedia.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Ailothaen
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Wikimedia and Wikipedia.
+(Other Mediawiki instances use the same API,so a similar extractor
+could be written)
+
+Various reference:
+https://www.mediawiki.org/wiki/API:Query
+https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category
+"""
+
+from .common import Extractor, Message
+import time
+import re
+
+
+class WikimediaArticleExtractor(Extractor):
+ category = "wikimedia"
+ subcategory = "article"
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{filename}"
+ pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)"
+ directory_fmt = ("{category}", "{page}")
+ test = (
+ ("https://en.wikipedia.org/wiki/Athena"),
+ ("https://zh.wikipedia.org/wiki/太阳"),
+ ("https://simple.wikipedia.org/wiki/Hydrogen", {
+ "count": ">= 2"
+ })
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.lang, self.page = match.groups()
+
+ def items(self):
+ continuation = None
+ gimcontinuation = None
+
+ while True:
+ if continuation is None:
+ file_list_request = self.request(
+ "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa
+ lang=self.lang, page=self.page
+ )
+ )
+ else:
+ file_list_request = self.request(
+ "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format( # noqa
+ lang=self.lang,
+ page=self.page,
+ continuation=continuation,
+ gimcontinuation=gimcontinuation,
+ )
+ )
+ file_list = file_list_request.json()
+
+ for file_index in list(file_list["query"]["pages"]):
+ image = file_list["query"]["pages"][file_index]["imageinfo"][0]
+
+ metadata = image
+ metadata["filename"] = WikimediaUtils.clean_name(
+ image["canonicaltitle"]
+ )[0]
+ metadata["extension"] = WikimediaUtils.clean_name(
+ image["canonicaltitle"]
+ )[1]
+
+ yield Message.Directory, {"page": self.page, "lang": self.lang}
+ yield Message.Url, image["url"], image
+ else:
+                # reached the end of the response;
+                # check whether there are more files to retrieve
+ try:
+ continuation_info = file_list["continue"]
+ except KeyError:
+ # No more continuation info: all files were retrieved
+ break
+ else:
+ # Continuation info is present
+ # there are still files to retrieve
+ continuation = continuation_info["continue"]
+ gimcontinuation = continuation_info["gimcontinue"]
+
+            # give the Wikipedia API a rest between requests
+ time.sleep(1)
+
+
+class WikimediaCategoryExtractor(Extractor):
+ category = "wikimedia"
+ subcategory = "category"
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{filename}"
+ pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)"
+ directory_fmt = ("{category}", "{page}")
+
+ test = (
+ ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"), # noqa
+ ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", { # noqa
+ "count": ">= 21"
+ })
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page = match.groups()[0]
+
+ def items(self):
+ continuation = None
+ gcmcontinuation = None
+
+ while True:
+ if continuation is None:
+ file_list_request = self.request(
+ "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa
+ page=self.page
+ )
+ )
+ else:
+ file_list_request = self.request(
+ "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format( # noqa
+ page=self.page,
+ continuation=continuation,
+ gcmcontinuation=gcmcontinuation,
+ )
+ )
+ file_list = file_list_request.json()
+
+ for file_index in list(file_list["query"]["pages"]):
+ image = file_list["query"]["pages"][file_index]["imageinfo"][0]
+
+ metadata = image
+ metadata["filename"] = WikimediaUtils.clean_name(
+ image["canonicaltitle"]
+ )[0]
+ metadata["extension"] = WikimediaUtils.clean_name(
+ image["canonicaltitle"]
+ )[1]
+
+ yield Message.Directory, {"page": self.page, "lang": "common"}
+ yield Message.Url, image["url"], image
+ else:
+                # reached the end of the response;
+                # check whether there are more files to retrieve
+ try:
+ continuation_info = file_list["continue"]
+ except KeyError:
+ # No more continuation info: all files were retrieved
+ break
+ else:
+ # Continuation info is present
+ # there are still files to retrieve
+ continuation = continuation_info["continue"]
+ gcmcontinuation = continuation_info["gcmcontinue"]
+
+            # give the Wikimedia API a rest between requests
+ time.sleep(1)
+
+
+class WikimediaUtils:
+ @staticmethod
+ def clean_name(name):
+ name = re.sub(r"^\w+:", "", name)
+ filename = ".".join(name.split(".")[:-1])
+ extension = name.split(".")[-1]
+ return filename, extension
From 221f54309cf5437ad887e89a5c71d1a4263294d6 Mon Sep 17 00:00:00 2001
From: Ailothaen
Date: Mon, 25 Apr 2022 23:14:16 +0200
Subject: [PATCH 76/77] [wikimedia] Improved archive identifiers
---
gallery_dl/extractor/wikimedia.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 41cc1c9e..a2ddfa2c 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -24,7 +24,7 @@ class WikimediaArticleExtractor(Extractor):
category = "wikimedia"
subcategory = "article"
filename_fmt = "{filename}.{extension}"
- archive_fmt = "{filename}"
+ archive_fmt = "a_{sha1}"
pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)"
directory_fmt = ("{category}", "{page}")
test = (
@@ -96,7 +96,7 @@ class WikimediaCategoryExtractor(Extractor):
category = "wikimedia"
subcategory = "category"
filename_fmt = "{filename}.{extension}"
- archive_fmt = "{filename}"
+ archive_fmt = "c_{sha1}"
pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)"
directory_fmt = ("{category}", "{page}")
From c3c1635ef35df7ef3f8884bd933578e79a2ade8c Mon Sep 17 00:00:00 2001
From: Mike Fährmann
Date: Tue, 16 Jan 2024 22:08:03 +0100
Subject: [PATCH 77/77] [wikimedia] update
- rewrite using BaseExtractor
- support most Wiki* domains
- update docs/supportedsites
- add tests
---
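Note: a rough sketch of the BaseExtractor pattern used for the rewrite
(domain list abbreviated; BaseExtractor.update() merges the
per-instance patterns into one combined URL regex):

    from gallery_dl.extractor.common import BaseExtractor

    class WikimediaExtractor(BaseExtractor):
        basecategory = "wikimedia"

    BASE_PATTERN = WikimediaExtractor.update({
        "wikipedia": {
            "root": None,  # root is derived from the matched URL
            "pattern": r"[a-z]{2,}\.wikipedia\.org",
        },
        "wiktionary": {
            "root": None,
            "pattern": r"[a-z]{2,}\.wiktionary\.org",
        },
        # ... one entry per supported Wiki* domain
    })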
docs/supportedsites.md | 58 ++++++
gallery_dl/extractor/wikimedia.py | 284 ++++++++++++++----------------
scripts/supportedsites.py | 1 +
test/results/wikibooks.py | 23 +++
test/results/wikimediacommons.py | 23 +++
test/results/wikinews.py | 23 +++
test/results/wikipedia.py | 53 ++++++
test/results/wikiquote.py | 23 +++
test/results/wikisource.py | 23 +++
test/results/wikispecies.py | 25 +++
test/results/wikiversity.py | 23 +++
test/results/wiktionary.py | 23 +++
12 files changed, 426 insertions(+), 156 deletions(-)
create mode 100644 test/results/wikibooks.py
create mode 100644 test/results/wikimediacommons.py
create mode 100644 test/results/wikinews.py
create mode 100644 test/results/wikipedia.py
create mode 100644 test/results/wikiquote.py
create mode 100644 test/results/wikisource.py
create mode 100644 test/results/wikispecies.py
create mode 100644 test/results/wikiversity.py
create mode 100644 test/results/wiktionary.py
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 53c88335..d3d2a8a3 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW.
|