From 74c225f94e967dbdc0cc919a541c82d28dffcb16 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 22:33:33 -0500 Subject: [PATCH 01/77] [bato] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/bato.py | 113 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + test/results/bato.py | 65 ++++++++++++++++++ 5 files changed, 186 insertions(+) create mode 100644 gallery_dl/extractor/bato.py create mode 100644 test/results/bato.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8e4c59a1..6040cd47 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -97,6 +97,12 @@ Consider all listed sites to potentially be NSFW. Albums, Artwork Listings, Challenges, Followed Users, individual Images, Likes, Search Results, User Profiles + + Bato + https://bato.to + Chapters, Manga + + BBC https://bbc.co.uk/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 695b8b2a..99de2169 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -24,6 +24,7 @@ modules = [ "architizer", "artstation", "aryion", + "bato", "bbc", "behance", "blogger", diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py new file mode 100644 index 00000000..c34b74fc --- /dev/null +++ b/gallery_dl/extractor/bato.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://bato.to and aliases (v3x only)""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text, exception +import re + +BASE_PATTERN = r"(?:https?://)?(?:bato\.to|dto\.to|batotoo\.com|wto\.to)" +MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" +CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" + +class BatoBase(): + """Base class for bato v3x extractors""" + category = "bato" + root = "https://bato.to" + +class BatoChapterExtractor(BatoBase, ChapterExtractor): + """Extractor for manga chapters from bato.to""" + pattern = BASE_PATTERN + "(" + MANGA_PATTERN + CHAPTER_PATTERN + ")" + # There are three possible patterns for a chapter + example = "https://bato.to/title/12345-manga-name-with-spaces/54212-ch_1.5" + example1 = "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" + example2 = "https://bato.to/title/12345/54212" + # v2x, not supported + example3 = "https://bato.to/chapter/54212" + + def __init__(self, match): + self.path = match.group(1) + ChapterExtractor.__init__(self, match, self.root + self.path) + + def metadata(self, page): + info, _ = text.extract(page, '', r' - Read Free Manga Online at Bato.To') + info = info.encode('latin-1').decode('utf-8').replace("\n", "") + + match = re.match( + r"(.+) - " + r"(?:Volume *(\d+) )?" 
+ r"Chapter *([\d\.]+)", info) + manga, volume, chapter = match.groups() if match else ("", "", info) + chapter, sep, minor = chapter.partition(".") + title_container = text.extr(page, f'") + title = text.extr(title_container, "", "") + + return { + "manga" : text.unescape(manga), + "title" : text.unescape(title), + "author" : "", + "volume" : text.parse_int(volume), + "chapter" : text.parse_int(chapter), + "chapter_minor": sep + minor, + } + + def images(self, page): + images_container = text.extr(page, 'pageOpts', ':[0,0]}"') + images_container = text.unescape(images_container) + + return [(url, None) for url in text.extract_iter(images_container, r'\"', r'\"')] + + +class BatoMangaExtractor(BatoBase, MangaExtractor): + """Extractor for manga from bato.to""" + reverse = False + chapterclass = BatoChapterExtractor + pattern = BASE_PATTERN + "(" + MANGA_PATTERN + "$" + ")" + # There are two possible patterns for a manga + example = "https://bato.to/title/12345-manga-name-with-spaces/" + example2 = "https://bato.to/title/12345/" + # v2x, not supported + example3 = "https://bato.to/series/12345/manga-name-with-space" + + def chapters(self, page): + data = {} + num_chapters, _ = text.extract(page, ">Chapters<", "") + num_chapters, _ = text.extract(num_chapters, r"", r"") + num_chapters = text.parse_int(num_chapters) + if num_chapters == 0: + raise exception.NotFoundError("chapter") + + manga, _ = text.extract(page, '', r' - Read Free Manga Online at Bato.To') + manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") + data["manga"] = manga + + results = [] + for chapter_num in range(num_chapters): + chapter, _ = text.extract(page, f'
") + chapter += r"" # Add this back in so we can match the date + url, pos = text.extract(chapter, '') + title, _ = text.extract(title, r"", r"") + if title is None or title == "" or title == "": + title, _ = text.extract(chapter, ">", "", pos) + + date, _ = text.extract(chapter, "") + date, _ = text.extract(date, 'time="', '"') + + data["date"] = date + data["title"] = title + data["chapter"] = text.parse_int(chapter_major) + data["chapter_minor"] = sep + chapter_minor + + if url.startswith("/"): + url = self.root + url + results.append((url, data.copy())) + return results \ No newline at end of file diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 4839660d..e3738b8b 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -32,6 +32,7 @@ CATEGORY_MAP = { "atfbooru" : "ATFBooru", "b4k" : "arch.b4k.co", "baraag" : "baraag", + "bato" : "Bato", "bbc" : "BBC", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", diff --git a/test/results/bato.py b/test/results/bato.py new file mode 100644 index 00000000..18479f9a --- /dev/null +++ b/test/results/bato.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import bato +from gallery_dl import exception + +__tests__ = ( +{ + "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official/1681030-ch_8", + "#category": ("", "bato", "chapter"), + "#class" : bato.BatoChapterExtractor, + "#count" : 66, + + "manga" : "I Shall Master this Family! [Official]", + "title" : "Observing", + "chapter" : 8, +}, +{ + "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", + "#comment" : "volume (vol) in url", + "#category": ("", "bato", "chapter"), + "#class" : bato.BatoChapterExtractor, + "#count" : 7, + + "manga" : "86--EIGHTY-SIX (Official)", + "title" : "The Spearhead Squadron's Power", + "volume" : 1, + "chapter" : 5, +}, +{ + "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#count" : ">= 21", + + "manga" : "Futsutsuka na Akujo de wa Gozaimasu ga - Suuguu Chouso Torikae Den (Official)", +}, +{ + "#url" : "https://bato.to/title/104929-86-eighty-six-official", + "#comment" : "Manga with number in name", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#count" : ">= 18", + + "manga" : "86--EIGHTY-SIX (Official)", +}, +{ + "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", + "#comment" : "Non-English translation (Indonesian)", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#count" : ">= 29", + + "manga" : "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", +}, +{ + "#url" : "https://bato.to/title/134270-removed", + "#category": ("", "bato", "manga"), + "#class" : bato.BatoMangaExtractor, + "#exception": exception.NotFoundError +} +) From 663b8d789a183d6465a45530eb511511b2d3faf7 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:41:37 -0500 Subject: [PATCH 02/77] Fix linting --- gallery_dl/extractor/bato.py | 42 +++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index c34b74fc..320f6999 
100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -14,27 +14,32 @@ BASE_PATTERN = r"(?:https?://)?(?:bato\.to|dto\.to|batotoo\.com|wto\.to)" MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" + class BatoBase(): """Base class for bato v3x extractors""" category = "bato" root = "https://bato.to" + class BatoChapterExtractor(BatoBase, ChapterExtractor): """Extractor for manga chapters from bato.to""" pattern = BASE_PATTERN + "(" + MANGA_PATTERN + CHAPTER_PATTERN + ")" # There are three possible patterns for a chapter example = "https://bato.to/title/12345-manga-name-with-spaces/54212-ch_1.5" - example1 = "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" - example2 = "https://bato.to/title/12345/54212" + example2 = \ + "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" + example3 = "https://bato.to/title/12345/54212" # v2x, not supported - example3 = "https://bato.to/chapter/54212" + example4 = "https://bato.to/chapter/54212" def __init__(self, match): self.path = match.group(1) ChapterExtractor.__init__(self, match, self.root + self.path) def metadata(self, page): - info, _ = text.extract(page, '', r' - Read Free Manga Online at Bato.To') + info, _ = text.extract( + page, "", r" - Read Free Manga Online at Bato.To" + ) info = info.encode('latin-1').decode('utf-8').replace("\n", "") match = re.match( @@ -58,8 +63,10 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): def images(self, page): images_container = text.extr(page, 'pageOpts', ':[0,0]}"') images_container = text.unescape(images_container) - - return [(url, None) for url in text.extract_iter(images_container, r'\"', r'\"')] + return [ + (url, None) + for url in text.extract_iter(images_container, r"\"", r"\"") + ] class BatoMangaExtractor(BatoBase, MangaExtractor): @@ -80,28 +87,33 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): num_chapters = text.parse_int(num_chapters) if num_chapters == 0: raise exception.NotFoundError("chapter") - - manga, _ = text.extract(page, '', r' - Read Free Manga Online at Bato.To') + + manga, _ = text.extract( + page, "", r" - Read Free Manga Online at Bato.To" + ) manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") data["manga"] = manga - + results = [] for chapter_num in range(num_chapters): - chapter, _ = text.extract(page, f'
") - chapter += r"" # Add this back in so we can match the date + chapter, _ = text.extract( + page, f'
" + ) + chapter += r"" # so we can match the date url, pos = text.extract(chapter, '') + title, _ = text.extract( + chapter, f'" + ) title, _ = text.extract(title, r"", r"") if title is None or title == "" or title == "": title, _ = text.extract(chapter, ">", "", pos) date, _ = text.extract(chapter, "") date, _ = text.extract(date, 'time="', '"') - + data["date"] = date data["title"] = title data["chapter"] = text.parse_int(chapter_major) @@ -110,4 +122,4 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): if url.startswith("/"): url = self.root + url results.append((url, data.copy())) - return results \ No newline at end of file + return results From 9c1ce28f688b1173508b347a8d975bb7ae6b0743 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:44:27 -0500 Subject: [PATCH 03/77] [bato] Added mangatoto alias --- gallery_dl/extractor/bato.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index 320f6999..b82416d5 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -10,7 +10,8 @@ from .common import ChapterExtractor, MangaExtractor from .. import text, exception import re -BASE_PATTERN = r"(?:https?://)?(?:bato\.to|dto\.to|batotoo\.com|wto\.to)" +BASE_PATTERN = r"(?:https?://)?" \ + r"(?:bato\.to|dto\.to|batotoo\.com|wto\.to|mangatoto\.com)" MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" From 06ff1d3a3cfc0d9b1d1e84b8faf66e74f3d3aadc Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:47:30 -0500 Subject: [PATCH 04/77] Replace text.extract with extr --- gallery_dl/extractor/bato.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index b82416d5..c885f27b 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -38,7 +38,7 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): ChapterExtractor.__init__(self, match, self.root + self.path) def metadata(self, page): - info, _ = text.extract( + info = text.extr( page, "", r" - Read Free Manga Online at Bato.To" ) info = info.encode('latin-1').decode('utf-8').replace("\n", "") @@ -83,13 +83,13 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): def chapters(self, page): data = {} - num_chapters, _ = text.extract(page, ">Chapters<", "
") - num_chapters, _ = text.extract(num_chapters, r"", r"") + num_chapters = text.extr(page, ">Chapters<", "
") + num_chapters = text.extr(num_chapters, r"", r"") num_chapters = text.parse_int(num_chapters) if num_chapters == 0: raise exception.NotFoundError("chapter") - manga, _ = text.extract( + manga = text.extr( page, "", r" - Read Free Manga Online at Bato.To" ) manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") @@ -97,7 +97,7 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): results = [] for chapter_num in range(num_chapters): - chapter, _ = text.extract( + chapter = text.extr( page, f'
" ) chapter += r"" # so we can match the date @@ -105,15 +105,15 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): chapter_no = re.search(r"-ch_([\d\.]+)", url).group(1) chapter_major, sep, chapter_minor = chapter_no.partition(".") - title, _ = text.extract( + title = text.extr( chapter, f'" ) - title, _ = text.extract(title, r"", r"") + title = text.extr(title, r"", r"") if title is None or title == "" or title == "": title, _ = text.extract(chapter, ">", "", pos) - date, _ = text.extract(chapter, "") - date, _ = text.extract(date, 'time="', '"') + date = text.extr(chapter, "") + date = text.extr(date, 'time="', '"') data["date"] = date data["title"] = title From 2c3f171d653b91e2536a9829866a932f66f4f32c Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Tue, 26 Dec 2023 23:52:06 -0500 Subject: [PATCH 05/77] Fix python 3.5 linting issue --- gallery_dl/extractor/bato.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index c885f27b..87d6c3c6 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -49,8 +49,8 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): r"Chapter *([\d\.]+)", info) manga, volume, chapter = match.groups() if match else ("", "", info) chapter, sep, minor = chapter.partition(".") - title_container = text.extr(page, f'") - title = text.extr(title_container, "", "") + title_section = text.extr(page, '") + title = text.extr(title_section, "", "") return { "manga" : text.unescape(manga), From e348da7a06da77689320fcb565f5aa4dfb6c8bd1 Mon Sep 17 00:00:00 2001 From: Antonio Date: Thu, 21 Dec 2023 12:50:54 -0600 Subject: [PATCH 06/77] [poringa] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/poringa.py | 129 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 3 + test/results/poringa.py | 47 +++++++++++ 5 files changed, 186 insertions(+) create mode 100644 gallery_dl/extractor/poringa.py create mode 100644 test/results/poringa.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8e4c59a1..b538749b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -679,6 +679,12 @@ Consider all listed sites to potentially be NSFW. Posts, User Profiles + + Poringa + http://www.poringa.net/ + Posts Images, Search Results, User Profiles + + Porn Image https://porn-images-xxx.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 695b8b2a..9c684bc0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -122,6 +122,7 @@ modules = [ "pixnet", "plurk", "poipiku", + "poringa", "pornhub", "pornpics", "postmill", diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py new file mode 100644 index 00000000..e5e80d57 --- /dev/null +++ b/gallery_dl/extractor/poringa.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for http://www.poringa.net/""" + +from .common import Extractor, Message +from .. 
import text, exception +from ..cache import cache +import itertools + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net" + + +class PoringaExtractor(Extractor): + category = "poringa" + directory_fmt = ("{category}", "{user}", "{post_id}") + filename_fmt = "{post_id}_{title}_{filename}.{extension}" + archive_fmt = "{post_id}" + root = "http://www.poringa.net" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item = match.group(1) + self.__cookies = True + + def items(self): + for post_id in self.posts(): + url = "{}/posts/imagenes/{}".format(self.root, post_id) + + try: + page = self.request(url).text + except exception.HttpError as exc: + self.log.warning( + "Unable to fetch posts for '%s' (%s)", post_id, exc) + continue + + title, pos = text.extract( + page, 'property="og:title" content="', '"') + pos = page.index('
', '
') + for url in text.extract_iter( + main_post, + 'Please wait a few moments", 0, 600) < 0: + return response + self.sleep(5.0, "check") + + def _pagination(self, url, params): + for params["p"] in itertools.count(1): + page = self.request(url, params=params).text + + posts_ids = PoringaPostExtractor.pattern.findall(page) + posts_ids = list(dict.fromkeys(posts_ids)) + yield from posts_ids + + if len(posts_ids) < 19: + return + + +class PoringaPostExtractor(PoringaExtractor): + """Extractor for posts on poringa.net""" + subcategory = "post" + pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)/[a-zA-Z0-9_-]+\.html" + example = "http://www.poringa.net/posts/imagenes/12/TITLE.html" + + def posts(self): + return (self.item,) + + +class PoringaUserExtractor(PoringaExtractor): + subcategory = "user" + pattern = BASE_PATTERN + r"/([a-zA-Z0-9_-]+)$" + example = "http://www.poringa.net/USER" + + def posts(self): + url = "{}/buscar/".format(self.root) + params = {"q": text.unquote(self.item)} + return self._pagination(url, params) + + +class PoringaSearchExtractor(PoringaExtractor): + subcategory = "search" + pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)" + example = "http://www.poringa.net/buscar/?q=QUERY" + + def posts(self): + url = self.root + "/buscar/" + params = {"q": text.unquote(self.item)} + return self._pagination(url, params) + + +@cache() +def _cookie_cache(): + return () diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 4839660d..2995a46f 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -234,6 +234,9 @@ SUBCATEGORY_MAP = { "sketch": "Sketch", "work": "individual Images", }, + "poringa": { + "post": "Posts Images", + }, "pornhub": { "gifs": "", }, diff --git a/test/results/poringa.py b/test/results/poringa.py new file mode 100644 index 00000000..b6c4e95d --- /dev/null +++ b/test/results/poringa.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import poringa + + +__tests__ = ( +{ + "#url" : "http://www.poringa.net/posts/imagenes/3051081/Turrita-alto-ojete.html", + "#category": ("", "poringa", "post"), + "#class" : poringa.PoringaPostExtractor, + "#pattern" : r"http://www\.poringa\.net/posts/imagenes/3051081/[a-zA-Z0-9_-]+\.html", + + "post_id" : "3051081", + "title" : "turrita alto ojete...", + "user" : "vipower1top", +}, + +{ + "#url" : "http://www.poringa.net/posts/imagenes/3095554/Otra-culona-de-instagram.html", + "#category": ("", "poringa", "post"), + "#class" : poringa.PoringaPostExtractor, + "#pattern" : r"http://www\.poringa\.net/posts/imagenes/3095554/[a-zA-Z0-9_-]+\.html", + + "post_id" : "3095554", + "title" : "Otra culona de instagram", + "user" : "Expectro007", +}, + +{ + "#url" : "http://www.poringa.net/Expectro007", + "#category": ("", "poringa", "user"), + "#class" : poringa.PoringaUserExtractor, + "#pattern" : r"https?://img-[0-9]\.poringa\.net/poringa/img/[a-zA-Z0-9/{2}]{12}[a-zA-Z0-9-_]+/[a-zA-Z0-9-_]+\.jpg", +}, + +{ + "#url" : "http://www.poringa.net/buscar/?&q=yuslopez", + "#category": ("", "poringa", "search"), + "#class" : poringa.PoringaSearchExtractor, + "#pattern" : r"https?://img-[0-9]\.poringa\.net/poringa/img/[a-zA-Z0-9/{2}]{12}[a-zA-Z0-9-_]+/[a-zA-Z0-9-_]+\.jpg", +}, + +) From 375f2db4c28477ba71acd05b03ebae55502d0fe9 Mon Sep 17 00:00:00 2001 From: blankie Date: Thu, 28 Dec 2023 01:06:48 +1100 Subject: [PATCH 07/77] [pinterest] add count metadata field --- gallery_dl/extractor/pinterest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 4b263934..c46a5879 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -47,6 +47,7 @@ class PinterestExtractor(Extractor): carousel_data = pin.get("carousel_data") if carousel_data: + pin["count"] = len(carousel_data["carousel_slots"]) for num, slot in enumerate(carousel_data["carousel_slots"], 1): slot["media_id"] = slot.pop("id") pin.update(slot) @@ -65,7 +66,7 @@ class PinterestExtractor(Extractor): if videos or media.get("duration") is None: pin.update(media) - pin["num"] = 0 + pin["num"] = pin["count"] = 1 pin["media_id"] = "" url = media["url"] From f36dafad063c43dd0b86da9621eac8df9c53e0b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 28 Dec 2023 19:07:04 +0100 Subject: [PATCH 08/77] improve 'include' handling (#4982) - remove spaces when given as string - warn about invalid vales --- gallery_dl/extractor/common.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 9b010c59..0dd05ef2 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -526,12 +526,15 @@ class Extractor(): if include == "all": include = extractors elif isinstance(include, str): - include = include.split(",") + include = include.replace(" ", "").split(",") result = [(Message.Version, 1)] for category in include: - if category in extractors: + try: extr, url = extractors[category] + except KeyError: + self.log.warning("Invalid include '%s'", category) + else: result.append((Message.Queue, url, {"_extractor": extr})) return iter(result) From 35530255847a30fb0eb70da6bb1937ffbd33ef81 Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:07:41 -0500 Subject: [PATCH 09/77] Removed f-strings --- gallery_dl/extractor/bato.py | 8 
++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index 87d6c3c6..082c5e0a 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -98,7 +98,9 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): results = [] for chapter_num in range(num_chapters): chapter = text.extr( - page, f'
" + page, + '
" ) chapter += r"" # so we can match the date url, pos = text.extract(chapter, '" + chapter, + '" ) title = text.extr(title, r"", r"") if title is None or title == "" or title == "": From f6ce870885a1df8dfed788c0c9c2cadee1c21f8f Mon Sep 17 00:00:00 2001 From: bug-assassin <7788433+bug-assassin@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:25:15 -0500 Subject: [PATCH 10/77] Better variable names --- gallery_dl/extractor/bato.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index 082c5e0a..d29a58bf 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -97,32 +97,33 @@ class BatoMangaExtractor(BatoBase, MangaExtractor): results = [] for chapter_num in range(num_chapters): - chapter = text.extr( + chapter_info = text.extr( page, '
" ) - chapter += r"" # so we can match the date - url, pos = text.extract(chapter, '" # so we can match the date + url, pos = text.extract(chapter_info, '" ) title = text.extr(title, r"", r"") if title is None or title == "" or title == "": - title, _ = text.extract(chapter, ">", "", pos) + title, _ = text.extract(chapter_info, ">", "", pos) - date = text.extr(chapter, "") + date = text.extr(chapter_info, "") date = text.extr(date, 'time="', '"') data["date"] = date data["title"] = title - data["chapter"] = text.parse_int(chapter_major) - data["chapter_minor"] = sep + chapter_minor + data["chapter"] = text.parse_int(chapt_major) + data["chapter_minor"] = sep + chapt_minor if url.startswith("/"): url = self.root + url From 085411f3f13d691f283f1d3fcfb99d80bbb19b29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 16:07:56 +0100 Subject: [PATCH 11/77] [rule34] recognize URLs with 'www' subdomain (#4984) --- gallery_dl/extractor/gelbooru_v02.py | 2 +- test/results/rule34.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 0864b9f6..0c8af3d5 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -168,7 +168,7 @@ INSTANCES = { }, "rule34": { "root": "https://rule34.xxx", - "pattern": r"rule34\.xxx", + "pattern": r"(?:www\.)?rule34\.xxx", "api_root": "https://api.rule34.xxx", }, "safebooru": { diff --git a/test/results/rule34.py b/test/results/rule34.py index ca90e511..f8fefa32 100644 --- a/test/results/rule34.py +++ b/test/results/rule34.py @@ -34,6 +34,13 @@ __tests__ = ( "#count" : 3, }, +{ + "#url" : "https://www.rule34.xxx/index.php?page=post&s=view&id=863", + "#comment" : "www subdomain", + "#category": ("gelbooru_v02", "rule34", "post"), + "#class" : gelbooru_v02.GelbooruV02PostExtractor, +}, + { "#url" : "https://rule34.xxx/index.php?page=post&s=view&id=863", "#category": ("gelbooru_v02", "rule34", "post"), From caceb14fc2802237f67eb2b70b31d2c34ec055a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 17:26:57 +0100 Subject: [PATCH 12/77] [tests] fail when a results file contains syntax errors or is otherwise not importable --- test/results/__init__.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/results/__init__.py b/test/results/__init__.py index 0fe87462..c54bea56 100644 --- a/test/results/__init__.py +++ b/test/results/__init__.py @@ -13,12 +13,8 @@ __directory__ = os.path.dirname(__file__) @functools.lru_cache(maxsize=None) def tests(name): - try: - module = __import__(name, globals(), None, (), 1) - return module.__tests__ - except Exception as exc: - print(exc) - return () + module = __import__(name, globals(), None, (), 1) + return module.__tests__ def all(): From 00d83d9588c3fa9ed6b753d0c6baa2dc90ce4a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 18:33:46 +0100 Subject: [PATCH 13/77] [rule34us] add fallback for 'video-cdn1' videos (#4985) --- gallery_dl/extractor/rule34us.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py index 6439a225..cf70cccb 100644 --- a/gallery_dl/extractor/rule34us.py +++ b/gallery_dl/extractor/rule34us.py @@ -38,7 +38,11 @@ class Rule34usExtractor(BooruExtractor): "height" : extr(' x ', 'h'), "file_url": extr(' src="', '"'), } - post["md5"] = 
post["file_url"].rpartition("/")[2].partition(".")[0] + + url = post["file_url"] + if "//video-cdn1." in url: + post["_fallback"] = (url.replace("//video-cdn1.", "//video."),) + post["md5"] = url.rpartition("/")[2].partition(".")[0] tags = collections.defaultdict(list) for tag_type, tag_name in self._find_tags(page): From 9f21c839ad6f0312fea3868568cdcb3313d09a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 30 Dec 2023 20:37:09 +0100 Subject: [PATCH 14/77] [poringa] improvements and fixes - add 'num' and 'count' metadata fields - prevent crash for "private" posts - prevent crash when there's no 'main-info' - update tests --- gallery_dl/extractor/poringa.py | 47 ++++++++++++++++++++------------- test/results/poringa.py | 27 ++++++++++++------- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py index e5e80d57..0149d060 100644 --- a/gallery_dl/extractor/poringa.py +++ b/gallery_dl/extractor/poringa.py @@ -17,8 +17,8 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net" class PoringaExtractor(Extractor): category = "poringa" directory_fmt = ("{category}", "{user}", "{post_id}") - filename_fmt = "{post_id}_{title}_{filename}.{extension}" - archive_fmt = "{post_id}" + filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}" + archive_fmt = "{post_id}_{num}" root = "http://www.poringa.net" def __init__(self, match): @@ -31,36 +31,45 @@ class PoringaExtractor(Extractor): url = "{}/posts/imagenes/{}".format(self.root, post_id) try: - page = self.request(url).text + response = self.request(url) except exception.HttpError as exc: self.log.warning( "Unable to fetch posts for '%s' (%s)", post_id, exc) continue + if "/registro-login?" in response.url: + self.log.warning("Private post '%s'", post_id) + continue + + page = response.text title, pos = text.extract( page, 'property="og:title" content="', '"') - pos = page.index('
', '
') - for url in text.extract_iter( - main_post, - ' Date: Sat, 30 Dec 2023 22:25:59 +0100 Subject: [PATCH 15/77] [nijie] add 'count' metadata field https://github.com/mikf/gallery-dl/issues/146#issuecomment-1812849102 --- gallery_dl/extractor/nijie.py | 7 +++++-- test/results/horne.py | 3 +++ test/results/nijie.py | 7 +++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 57c31184..b9917057 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -55,9 +55,12 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): else: data["user_id"] = data["artist_id"] data["user_name"] = data["artist_name"] - yield Message.Directory, data - for num, url in enumerate(self._extract_images(image_id, page)): + urls = list(self._extract_images(image_id, page)) + data["count"] = len(urls) + + yield Message.Directory, data + for num, url in enumerate(urls): image = text.nameext_from_url(url, { "num": num, "url": "https:" + url, diff --git a/test/results/horne.py b/test/results/horne.py index 9058a481..f6bddba8 100644 --- a/test/results/horne.py +++ b/test/results/horne.py @@ -83,6 +83,7 @@ __tests__ = ( "artist_id" : 58000, "artist_name": "のえるわ", + "count" : 1, "date" : "dt:2018-01-29 14:25:39", "description": "前回とシチュがまるかぶり \r\n竿野郎は塗るのだるかった", "extension" : "png", @@ -113,9 +114,11 @@ __tests__ = ( "artist_id" : 58000, "artist_name": "のえるわ", + "count" : 4, "date" : "dt:2018-02-04 14:47:24", "description": "ノエル「そんなことしなくても、言ってくれたら咥えるのに・・・♡」", "image_id" : 8716, + "num" : range(0, 3), "tags" : [ "男の娘", "フェラ", diff --git a/test/results/nijie.py b/test/results/nijie.py index 01ac8fac..a2c05c81 100644 --- a/test/results/nijie.py +++ b/test/results/nijie.py @@ -31,12 +31,13 @@ __tests__ = ( "artist_id" : 44, "artist_name": "ED", + "count" : 1, "date" : datetime.datetime, "description": str, "extension" : "jpg", "filename" : str, "image_id" : int, - "num" : int, + "num" : 0, "tags" : list, "title" : str, "url" : r"re:https://pic.nijie.net/\d+/nijie/.*jpg$", @@ -102,11 +103,12 @@ __tests__ = ( "#class" : nijie.NijieImageExtractor, "#urls" : "https://pic.nijie.net/06/nijie/14/44/44/illust/0_0_28e8c02d921bee33_9222d3.jpg", "#sha1_url" : "3d654e890212ba823c9647754767336aebc0a743", - "#sha1_metadata": "41da5d0e178b04f01fe72460185df52fadc3c91b", + "#sha1_metadata": "58e716bcb03b431cae901178c198c787908e1c0c", "#sha1_content" : "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", "artist_id" : 44, "artist_name": "ED", + "count" : 1, "date" : "dt:2014-01-18 19:58:21", "description": "租絵にてお邪魔いたし候\r\n是非ともこの”おっぱい”をご高覧賜りたく馳せ参じた次第\r\n長文にて失礼仕る\r\n\r\nまず全景でありますが、首を右に傾けてみて頂きたい\r\nこの絵図は茶碗を眺めていた私が思わぬ美しさにて昇天したときのものを、筆をとり、したためたものである(トレースではない)\r\n筆は疾風の如く走り、半刻過ぎには私好みの”おっぱい”になっていたのである!\r\n次に細部をみて頂きたい\r\n絵図を正面から見直して頂くと、なんとはんなりと美しいお椀型をしたおっぱいであろうか  右手から緩やかに生まれる曲線は左手に進むにつれて、穏やかな歪みを含み流れる  これは所謂轆轤目であるが三重の紐でおっぱいをぐるぐると巻きつけた情景そのままであり、この歪みから茶碗の均整は崩れ、たぷんたぷんのおっぱいの重量感を醸し出している!\r\nさらに左手に進めば梅花皮(カイラギ)を孕んだ高大が現れる 今回は点線にて表現するが、その姿は乳首から母乳が噴出するが如く 或は精子をぶっかけられたが如く 白くとろっとした釉薬の凝固が素晴しい景色をつくりだしているのである!\r\n最後には極めつけ、すくっと螺旋を帯びながらそそり立つ兜巾(ときん)!この情景はまさしく乳首である!  
全体をふんわりと盛り上げさせる乳輪にちょこっと存在する乳頭はぺろりと舌で確かめ勃起させたくなる風情がある!\r\n\r\nこれを”おっぱい”と呼ばずなんと呼ぼうや!?\r\n\r\n興奮のあまり失礼致した\r\n御免", "extension" : "jpg", @@ -133,6 +135,7 @@ __tests__ = ( "artist_id" : 49509, "artist_name": "黒川 竜", + "count" : 4, "date" : "dt:2023-12-02 04:19:29", "description": "【DLサイトコム】ウィンターセール 30%OFF\r\n期間:2024年2月14日まで\r\n【toloveるドリンク】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ042727.html\r\n【toloveるドリンク2】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ043289.html\r\n【クランクランBIG】\r\nhttps://www.dlsite.com/maniax/work/=/product_id/RJ043564.html", "image_id" : 594044, From fe2147b3efe580f5b44bbab30f4d2ef30be7ea92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 31 Dec 2023 01:24:12 +0100 Subject: [PATCH 16/77] [docs] document 'write-pages' (#4543) --- docs/configuration.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 2a9029ed..de180973 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1066,6 +1066,25 @@ Description after a colon ``:``, for example ``{date:%Y%m%d}``. +extractor.*.write-pages +----------------------- +Type + * ``bool`` + * ``string`` +Default + ``false`` +Description + During data extraction, + write received HTTP request data + to enumerated files in the current working directory. + + Special values: + + * ``"all"``: Include HTTP request and response headers. Hide ``Authorization``, ``Cookie``, and ``Set-Cookie`` values. + * ``"ALL"``: Include all HTTP request and response headers. + + + Extractor-specific Options ========================== From 27d5fc3697f31f50a1a78df5224d42e8b43e9c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Jan 2024 16:08:10 +0100 Subject: [PATCH 17/77] [docs] document 'tls12' (#4543) https://github.com/mikf/gallery-dl/issues/4760#issuecomment-1793345940 --- docs/configuration.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index de180973..e922f813 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -627,6 +627,20 @@ Description `ssl.SSLContext.set_ciphers() `__ +extractor.*.tls12 +----------------- +Type + ``bool`` +Default + * ``true`` + * ``false`` for ``patreon``, ``pixiv:series`` +Description + Allow selecting TLS 1.2 cipher suites. + + Can be disabled to alter TLS fingerprints + and potentially bypass Cloudflare blocks. 
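A minimal sketch of setting this option programmatically, assuming gallery-dl's standard config API; "SITE" is a placeholder for whatever extractor category is being blocked, and the comment shows the usual config-file equivalent:

    # config-file equivalent: {"extractor": {"SITE": {"tls12": false}}}
    from gallery_dl import config

    config.set(("extractor", "SITE"), "tls12", False)
    # any job created afterwards, e.g. gallery_dl.job.DownloadJob(url),
    # picks the value up through the extractor's normal config lookup
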
+ + extractor.*.keywords -------------------- Type From 7aa1c9671baf6193cadc878c6a44477e8575dbc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Jan 2024 02:51:34 +0100 Subject: [PATCH 18/77] [tests] fix 'invalid escape sequence' warnings --- test/results/4plebs.py | 2 +- test/results/imgbb.py | 2 +- test/results/paheal.py | 2 +- test/results/raddle.py | 4 ++-- test/results/wikiart.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/results/4plebs.py b/test/results/4plebs.py index bae62608..affe14d8 100644 --- a/test/results/4plebs.py +++ b/test/results/4plebs.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://archive.4plebs.org/tg/thread/54059290", "#category": ("foolfuuka", "4plebs", "thread"), "#class" : foolfuuka.FoolfuukaThreadExtractor, - "#pattern" : "https://i\.4pcdn\.org/tg/1[34]\d{11}\.(jpg|png|gif)", + "#pattern" : r"https://i\.4pcdn\.org/tg/1[34]\d{11}\.(jpg|png|gif)", "#count" : 30, }, diff --git a/test/results/imgbb.py b/test/results/imgbb.py index b2351d0f..e2d1bc33 100644 --- a/test/results/imgbb.py +++ b/test/results/imgbb.py @@ -21,7 +21,7 @@ __tests__ = ( "album_id" : "i5PggF", "album_name" : "British Scrap Book", "extension" : "jpg", - "id" : "re:^\w{7}$", + "id" : r"re:^\w{7}$", "title" : str, "url" : r"re:https://i\.ibb\.co/\w{7}/[\w-]+\.jpg", "user" : "folkie", diff --git a/test/results/paheal.py b/test/results/paheal.py index 1772593b..46b210f6 100644 --- a/test/results/paheal.py +++ b/test/results/paheal.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://rule34.paheal.net/post/list/Ayane_Suzuki/1", "#category": ("shimmie2", "paheal", "tag"), "#class" : paheal.PahealTagExtractor, - "#pattern" : "https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20|https://r34i\.paheal-cdn\.net/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}$", + "#pattern" : r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20|https://r34i\.paheal-cdn\.net/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}$", "#count" : range(70, 200), "date" : "type:datetime", diff --git a/test/results/raddle.py b/test/results/raddle.py index 4e60abb7..0c9de429 100644 --- a/test/results/raddle.py +++ b/test/results/raddle.py @@ -21,7 +21,7 @@ __tests__ = ( "#category": ("postmill", "raddle.me", "forum"), "#class" : postmill.PostmillForumExtractor, "#count" : 1, - "#pattern" : "^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$", + "#pattern" : r"^https://raddle\.me/f/traa/156646/click-here-to-go-to-f-traaaaaaannnnnnnnnns$", }, { @@ -97,7 +97,7 @@ __tests__ = ( "#comment" : "Link + text post (with text disabled)", "#category": ("postmill", "raddle.me", "post"), "#class" : postmill.PostmillPostExtractor, - "#pattern" : "^https://fantasyanime\.com/anime/neo-tokyo-dub$", + "#pattern" : r"^https://fantasyanime\.com/anime/neo-tokyo-dub$", "#count" : 1, }, diff --git a/test/results/wikiart.py b/test/results/wikiart.py index 47eb3ec7..9ab13103 100644 --- a/test/results/wikiart.py +++ b/test/results/wikiart.py @@ -12,7 +12,7 @@ __tests__ = ( "#url" : "https://www.wikiart.org/en/thomas-cole", "#category": ("", "wikiart", "artist"), "#class" : wikiart.WikiartArtistExtractor, - "#pattern" : "https://uploads\d+\.wikiart\.org/(\d+/)?images/thomas-cole/[\w()-]+\.(jpg|png)", + "#pattern" : r"https://uploads\d+\.wikiart\.org/(\d+/)?images/thomas-cole/[\w()-]+\.(jpg|png)", "#count" : "> 100", "albums" : None, From 63f649cd92a1aa2e70df8a2edbdb180ccae49f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Jan 2024 17:38:32 +0100 Subject: [PATCH 19/77] 
[idolcomplex] fix extraction & update URL patterns (#5002) --- gallery_dl/extractor/idolcomplex.py | 17 ++++++++------ test/results/idolcomplex.py | 36 ++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index b9e2c3dd..f70a948c 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -34,8 +34,11 @@ class IdolcomplexExtractor(SankakuExtractor): self.start_post = 0 def _init(self): + self.find_pids = re.compile( + r" href=[\"#]/\w\w/posts/([0-9a-f]+)" + ).findall self.find_tags = re.compile( - r'tag-type-([^"]+)">\s*
]+>\s*\s*]*?href="/[^?]*\?tags=([^"]+)' ).findall def items(self): @@ -149,8 +152,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)" - example = "https://idol.sankakucomplex.com/?tags=TAGS" + pattern = BASE_PATTERN + r"/(?:posts/?)?\?([^#]*)" + example = "https://idol.sankakucomplex.com/en/posts?tags=TAGS" per_page = 20 def __init__(self, match): @@ -196,7 +199,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): page = self.request(self.root, params=params, retries=10).text pos = ((page.find('id="more-popular-posts-link"') + 1) or (page.find(' Date: Mon, 1 Jan 2024 22:05:21 +0100 Subject: [PATCH 20/77] [manganelo] fix extraction & recognize '.to' TLDs (#5005) --- gallery_dl/extractor/manganelo.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 46019ad8..232b98d4 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -10,7 +10,11 @@ from .common import ChapterExtractor, MangaExtractor from .. import text import re -BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)" +BASE_PATTERN = ( + r"(?:https?://)?" + r"((?:chap|read|www\.|m\.)?mangan(?:at|el)o" + r"\.(?:to|com))" +) class ManganeloBase(): @@ -67,10 +71,11 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): def images(self, page): page = text.extr( - page, 'class="container-chapter-reader', '\n Date: Mon, 1 Jan 2024 22:58:42 +0100 Subject: [PATCH 21/77] [twitter] raise error for invalid 'strategy' values (#4953) --- gallery_dl/extractor/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index fdcefddc..aa9ab9f6 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -552,9 +552,11 @@ class TwitterTimelineExtractor(TwitterExtractor): return self.api.user_media if strategy == "tweets": return self.api.user_tweets + if strategy == "media": + return self.api.user_media if strategy == "with_replies": return self.api.user_tweets_and_replies - return self.api.user_media + raise exception.StopExtraction("Invalid strategy '%s'", strategy) class TwitterTweetsExtractor(TwitterExtractor): From ee65f3de437b2d782d4d05765ecf93aa6ce19387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 Jan 2024 15:03:04 +0100 Subject: [PATCH 22/77] [docs] add parent>child example (#4621) --- docs/gallery-dl-example.conf | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index c3f80493..cda584e3 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -176,16 +176,15 @@ "imgur": { - "#": "use different directory and filename formats when coming from a reddit post", - "directory": - { - "'_reddit' in locals()": [] - }, - "filename": - { - "'_reddit' in locals()": "{_reddit[id]} {id}.{extension}", - "" : "{id}.{extension}" - } + "#": "general imgur settings", + "filename": "{id}.{extension}" + }, + + "reddit>imgur": + { + "#": "special settings for imgur URLs found in reddit posts", + "directory": [], + "filename": "{_reddit[id]} {_reddit[title]} {id}.{extension}" }, "tumblr": From 4f3671458efc2d4f91baf31d2a1cfc54055872c5 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 2 Jan 2024 23:45:59 +0100 Subject: [PATCH 23/77] [deviantart] add 'avatar' and 'background' extractors (#4995) --- docs/configuration.rst | 8 ++- docs/supportedsites.md | 2 +- gallery_dl/extractor/deviantart.py | 56 ++++++++++++++-- test/results/deviantart.py | 100 ++++++++++++++++++++++++++++- 4 files changed, 154 insertions(+), 12 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index e922f813..cbc54a7d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1401,7 +1401,13 @@ Description when processing a user profile. Possible values are - ``"gallery"``, ``"scraps"``, ``"journal"``, ``"favorite"``, ``"status"``. + ``"avatar"``, + ``"background"``, + ``"gallery"``, + ``"scraps"``, + ``"journal"``, + ``"favorite"``, + ``"status"``. It is possible to use ``"all"`` instead of listing all values separately. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b538749b..dbdaac24 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -148,7 +148,7 @@ Consider all listed sites to potentially be NSFW. DeviantArt https://www.deviantart.com/ - Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches + Avatars, Backgrounds, Collections, Deviations, Favorites, Folders, Followed Users, Galleries, Gallery Searches, Journals, Popular Images, Scraps, Search Results, Sta.sh, Status Updates, Tag Searches, User Profiles, Watches OAuth diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2ba47e1e..4b5f1d77 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = match.group(1) or match.group(2) + self.user = (match.group(1) or match.group(2)).lower() self.offset = 0 def _init(self): @@ -104,7 +104,6 @@ class DeviantartExtractor(Extractor): raise exception.StopExtraction() else: self.subcategory = "group-" + self.subcategory - self.user = self.user.lower() self.group = True for deviation in self.deviations(): @@ -513,11 +512,13 @@ class DeviantartUserExtractor(DeviantartExtractor): def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( - (DeviantartGalleryExtractor , base + "gallery"), - (DeviantartScrapsExtractor , base + "gallery/scraps"), - (DeviantartJournalExtractor , base + "posts"), - (DeviantartStatusExtractor , base + "posts/statuses"), - (DeviantartFavoriteExtractor, base + "favourites"), + (DeviantartAvatarExtractor , base + "avatar"), + (DeviantartBackgroundExtractor, base + "banner"), + (DeviantartGalleryExtractor , base + "gallery"), + (DeviantartScrapsExtractor , base + "gallery/scraps"), + (DeviantartJournalExtractor , base + "posts"), + (DeviantartStatusExtractor , base + "posts/statuses"), + (DeviantartFavoriteExtractor , base + "favourites"), ), ("gallery",)) @@ -538,6 +539,47 @@ class DeviantartGalleryExtractor(DeviantartExtractor): return self._folder_urls(folders, "gallery", DeviantartFolderExtractor) +class DeviantartAvatarExtractor(DeviantartExtractor): + """Extractor for an artist's avatar""" + subcategory = "avatar" + archive_fmt = "a_{_username}_{index}" + pattern = BASE_PATTERN + r"/avatar" + example = "https://www.deviantart.com/USER/avatar/" + + def deviations(self): + profile = 
self.api.user_profile(self.user.lower()) + if profile: + url = profile["user"]["usericon"] + return ({ + "author" : profile["user"], + "category" : "avatar", + "index" : text.parse_int(url.rpartition("?")[2]), + "is_deleted" : False, + "is_downloadable": False, + "published_time" : 0, + "title" : "avatar", + "content" : { + "src": url.replace("/avatars/", "/avatars-big/", 1), + }, + },) + return () + + +class DeviantartBackgroundExtractor(DeviantartExtractor): + """Extractor for an artist's banner""" + subcategory = "background" + archive_fmt = "b_{index}" + pattern = BASE_PATTERN + r"/ba(?:nner|ckground)" + example = "https://www.deviantart.com/USER/banner/" + + def deviations(self): + try: + return (self.api.user_profile(self.user.lower()) + ["cover_deviation"]["cover_deviation"],) + except Exception: + return () + + class DeviantartFolderExtractor(DeviantartExtractor): """Extractor for deviations inside an artist's gallery folder""" subcategory = "folder" diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 4196f32c..45ee6c18 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -14,7 +14,7 @@ __tests__ = ( "#url" : "https://www.deviantart.com/shimoda7", "#category": ("", "deviantart", "user"), "#class" : deviantart.DeviantartUserExtractor, - "#pattern" : "/shimoda7/gallery$", + "#urls" : "https://www.deviantart.com/shimoda7/gallery", }, { @@ -22,8 +22,15 @@ __tests__ = ( "#category": ("", "deviantart", "user"), "#class" : deviantart.DeviantartUserExtractor, "#options" : {"include": "all"}, - "#pattern" : "/shimoda7/(gallery(/scraps)?|posts(/statuses)?|favourites)$", - "#count" : 5, + "#urls" : ( + "https://www.deviantart.com/shimoda7/avatar", + "https://www.deviantart.com/shimoda7/banner", + "https://www.deviantart.com/shimoda7/gallery", + "https://www.deviantart.com/shimoda7/gallery/scraps", + "https://www.deviantart.com/shimoda7/posts", + "https://www.deviantart.com/shimoda7/posts/statuses", + "https://www.deviantart.com/shimoda7/favourites", + ), }, { @@ -195,6 +202,93 @@ __tests__ = ( "#class" : deviantart.DeviantartGalleryExtractor, }, +{ + "#url" : "https://deviantart.com/shimoda7/avatar", + "#category": ("", "deviantart", "avatar"), + "#class" : deviantart.DeviantartAvatarExtractor, + "#urls" : "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4", + "#sha1_content": "abf2cc79b842315f2e54bfdd93bf794a0f612b6f", + + "author" : { + "type" : "premium", + "usericon": "https://a.deviantart.net/avatars/s/h/shimoda7.jpg?4", + "userid" : "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B", + "username": "shimoda7", + }, + "content" : { + "src": "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4" + }, + "da_category" : "avatar", + "date" : "dt:1970-01-01 00:00:00", + "extension" : "jpg", + "filename" : "avatar_by_shimoda7-d4", + "index" : 4, + "index_base36" : "4", + "is_deleted" : False, + "is_downloadable": False, + "is_original" : True, + "published_time" : 0, + "target" : { + "extension": "jpg", + "filename" : "avatar_by_shimoda7-d4", + "src" : "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4" + }, + "title" : "avatar", + "username" : "shimoda7", +}, + +{ + "#url" : "https://deviantart.com/gdldev/banner", + "#category": ("", "deviantart", "background"), + "#class" : deviantart.DeviantartBackgroundExtractor, + "#pattern" : r"https://wixmp-\w+\.wixmp\.com/f/b042e0ae-a7ff-420b-a41a-b35503427360/dgntyqc-3deebb65-04b4-4085-992a-aa0c0e7e225d\.png\?token=ey[\w.-]+$", + "#sha1_content": "980eaa76ce515f1b6bef60dfadf26a5bbe9c583f", + + 
"allows_comments" : True, + "author" : { + "type" : "regular", + "usericon": "https://a.deviantart.net/avatars/g/d/gdldev.jpg?2", + "userid" : "1A12BA26-33C2-AA0A-7678-0B6DFBA7AC8E", + "username": "gdldev" + }, + "category_path" : "", + "content" : { + "filename" : "banner_by_gdldev_dgntyqc.png", + "filesize" : 84510, + "height" : 4000, + "src" : r"re:https://wixmp-\w+\.wixmp\.com/f/b042e0ae-a7ff-420b-a41a-b35503427360/dgntyqc-3deebb65-04b4-4085-992a-aa0c0e7e225d\.png\?token=ey[\w.-]+$", + "transparency": False, + "width" : 6400 + }, + "da_category" : "Uncategorized", + "date" : "dt:2024-01-02 21:16:06", + "deviationid" : "8C8D6B28-766A-DE21-7F7D-CE055C3BD50A", + "download_filesize": 84510, + "extension" : "png", + "filename" : "banner_by_gdldev-dgntyqc", + "index" : 1007488020, + "index_base36" : "gntyqc", + "is_blocked" : False, + "is_deleted" : False, + "is_downloadable" : True, + "is_favourited" : False, + "is_mature" : False, + "is_original" : True, + "is_published" : False, + "preview" : dict, + "printid" : None, + "published_time" : 1704230166, + "stats" : { + "comments" : 0, + "favourites": 0, + }, + "target" : dict, + "thumbs" : list, + "title" : "Banner", + "url" : "https://sta.sh/0198jippkeys", + "username" : "gdldev", +}, + { "#url" : "https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", "#comment" : "user", From 00570028a365b514b8636d434112ad30e333f9a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 3 Jan 2024 01:25:50 +0100 Subject: [PATCH 24/77] [cookies] fix macOS Firefox profile path https://github.com/yt-dlp/yt-dlp/commit/85b33f5c163f60dbd089a6b9bc2ba1366d3ddf93 --- gallery_dl/cookies.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 416cc9a1..478abb63 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -215,9 +215,11 @@ def _firefox_cookies_database(profile=None, container=None): def _firefox_browser_directory(): if sys.platform in ("win32", "cygwin"): - return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles") + return os.path.expandvars( + r"%APPDATA%\Mozilla\Firefox\Profiles") if sys.platform == "darwin": - return os.path.expanduser("~/Library/Application Support/Firefox") + return os.path.expanduser( + "~/Library/Application Support/Firefox/Profiles") return os.path.expanduser("~/.mozilla/firefox") From 7eaf648f2e937114d5c3a2c60a98b3469acb5b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 Jan 2024 15:01:33 +0100 Subject: [PATCH 25/77] [fanbox] add 'metadata' option (#4921) extracts 'plan' and extended 'user' metadata --- docs/configuration.rst | 17 +++++++++- gallery_dl/extractor/fanbox.py | 59 +++++++++++++++++++++++++++++++--- test/results/fanbox.py | 37 +++++++++++++++++++++ 3 files changed, 107 insertions(+), 6 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index cbc54a7d..8a1752ee 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1569,7 +1569,7 @@ Default ``false`` Example * ``notes,pools`` - * ``["notes", "pools"`` + * ``["notes", "pools"]`` Description Extract additional metadata (notes, pool metadata) if available. @@ -1711,6 +1711,21 @@ Description * ``false``: Ignore embeds. +extractor.fanbox.metadata +------------------------- +Type + * ``bool`` + * ``string`` + * ``list`` of ``strings`` +Default + ``false`` +Example + * ``user,plan`` + * ``["user", "plan"]`` +Description + Extract ``plan`` and extended ``user`` metadata. 
+ + extractor.flickr.access-token & .access-token-secret ---------------------------------------------------- Type diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 4572bea6..61a39283 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -8,6 +8,7 @@ from .common import Extractor, Message from .. import text +from ..cache import memcache import re BASE_PATTERN = ( @@ -27,8 +28,20 @@ class FanboxExtractor(Extractor): _warning = True def _init(self): + self.headers = {"Origin": self.root} self.embeds = self.config("embeds", True) + includes = self.config("metadata") + if includes: + if isinstance(includes, str): + includes = includes.split(",") + elif not isinstance(includes, (list, tuple)): + includes = ("user", "plan") + self._meta_user = ("user" in includes) + self._meta_plan = ("plan" in includes) + else: + self._meta_user = self._meta_plan = False + if self._warning: if not self.cookies_check(("FANBOXSESSID",)): self.log.warning("no 'FANBOXSESSID' cookie set") @@ -43,11 +56,9 @@ class FanboxExtractor(Extractor): """Return all relevant post objects""" def _pagination(self, url): - headers = {"Origin": self.root} - while url: url = text.ensure_http_scheme(url) - body = self.request(url, headers=headers).json()["body"] + body = self.request(url, headers=self.headers).json()["body"] for item in body["items"]: try: yield self._get_post_data(item["id"]) @@ -58,9 +69,8 @@ class FanboxExtractor(Extractor): def _get_post_data(self, post_id): """Fetch and process post data""" - headers = {"Origin": self.root} url = "https://api.fanbox.cc/post.info?postId="+post_id - post = self.request(url, headers=headers).json()["body"] + post = self.request(url, headers=self.headers).json()["body"] content_body = post.pop("body", None) if content_body: @@ -98,8 +108,47 @@ class FanboxExtractor(Extractor): post["text"] = content_body.get("text") if content_body else None post["isCoverImage"] = False + if self._meta_user: + post["user"] = self._get_user_data(post["creatorId"]) + if self._meta_plan: + plans = self._get_plan_data(post["creatorId"]) + post["plan"] = plans[post["feeRequired"]] + return content_body, post + @memcache(keyarg=1) + def _get_user_data(self, creator_id): + url = "https://api.fanbox.cc/creator.get" + params = {"creatorId": creator_id} + data = self.request(url, params=params, headers=self.headers).json() + + user = data["body"] + user.update(user.pop("user")) + + return user + + @memcache(keyarg=1) + def _get_plan_data(self, creator_id): + url = "https://api.fanbox.cc/plan.listCreator" + params = {"creatorId": creator_id} + data = self.request(url, params=params, headers=self.headers).json() + + plans = {0: { + "id" : "", + "title" : "", + "fee" : 0, + "description" : "", + "coverImageUrl" : "", + "creatorId" : creator_id, + "hasAdultContent": None, + "paymentMethod" : None, + }} + for plan in data["body"]: + del plan["user"] + plans[plan["fee"]] = plan + + return plans + def _get_urls_from_post(self, content_body, post): num = 0 cover_image = post.get("coverImageUrl") diff --git a/test/results/fanbox.py b/test/results/fanbox.py index 78f7fe54..32f13096 100644 --- a/test/results/fanbox.py +++ b/test/results/fanbox.py @@ -86,6 +86,43 @@ __tests__ = ( "content": r"re:(?s)^Greetings from FANBOX.\n \nAs of Monday, September 5th, 2022, we are happy to announce the start of the FANBOX hashtag event #MySetupTour ! 
\nAbout the event\nTo join this event .+ \nPlease check this page for further details regarding the Privacy & Terms.\nhttps://fanbox.pixiv.help/.+/10184952456601\n\n\nThank you for your continued support of FANBOX.$", }, +{ + "#url" : "https://official-en.fanbox.cc/posts/7022572", + "#comment" : "'plan' and 'user' metadata (#4921)", + "#category": ("", "fanbox", "post"), + "#class" : fanbox.FanboxPostExtractor, + "#options" : {"metadata": True}, + + "plan": { + "coverImageUrl" : "", + "creatorId" : "official-en", + "description" : "", + "fee" : 0, + "hasAdultContent": None, + "id" : "", + "paymentMethod" : None, + "title" : "", + }, + "user": { + "coverImageUrl" : "https://pixiv.pximg.net/c/1620x580_90_a2_g5/fanbox/public/images/creator/74349833/cover/n9mX8q4tUXHXXj7sK1RPWyUu.jpeg", + "creatorId" : "official-en", + "description" : "This is the official English pixivFANBOX account! \n(official Japanese account: https://official.fanbox.cc/ )\n\npixivFANBOX is a subscription service for building a reliable fan community where creators can nurture creative lifestyles together with their fans.\nFollowers can be notified of the updates from their favorite creators they are following. Supporters can enjoy closer communication with creators through exclusive content and their latest information.\n", + "hasAdultContent" : False, + "hasBoothShop" : False, + "iconUrl" : "https://pixiv.pximg.net/c/160x160_90_a2_g5/fanbox/public/images/user/74349833/icon/oJH0OoGoSixLrJXlnneNvC95.jpeg", + "isAcceptingRequest": False, + "isFollowed" : False, + "isStopped" : False, + "isSupported" : False, + "name" : "pixivFANBOX English", + "profileItems" : [], + "profileLinks" : [ + "https://twitter.com/pixivfanbox", + ], + "userId" : "74349833", + }, +}, + { "#url" : "https://mochirong.fanbox.cc/posts/3746116", "#comment" : "imageMap file order (#2718)", From a86775f6175460ea6ceb963567000c2b6e7002fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 4 Jan 2024 15:05:33 +0100 Subject: [PATCH 26/77] [gelbooru] fix 'favorite' extractor (#4903) lots of +1/-1 and = last: + pnum, last = divmod(count-1, self.per_page) + if self.offset > last: + # page number change self.offset -= last - diff, self.offset = divmod(self.offset, self.per_page) + diff, self.offset = divmod(self.offset-1, self.per_page) pnum -= diff + 1 skip = self.offset @@ -183,8 +184,8 @@ class GelbooruFavoriteExtractor(GelbooruBase, while True: favs = self._api_request(params, "favorite") - favs.reverse() + if skip: favs = favs[skip:] skip = 0 From 0f3013610997d67c6c952e2974597af7797e6957 Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Thu, 4 Jan 2024 21:38:59 +0530 Subject: [PATCH 27/77] [zzup] add 'gallery' extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/zzup.py | 40 ++++++++++++++++++++++++++++++++ test/results/zzup.py | 31 +++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 gallery_dl/extractor/zzup.py create mode 100644 test/results/zzup.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9c684bc0..8d974ecc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -178,6 +178,7 @@ modules = [ "xhamster", "xvideos", "zerochan", + "zzup", "booru", "moebooru", "foolfuuka", diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py new file mode 100644 index 00000000..45b0cd80 --- /dev/null +++ b/gallery_dl/extractor/zzup.py @@ -0,0 +1,40 @@ +# -*- coding: 
utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import GalleryExtractor +from .. import text + + +class ZzupGalleryExtractor(GalleryExtractor): + category = "zzup" + directory_fmt = ("{category}", "{title}") + filename_fmt = "{slug}_{num:>03}.{extension}" + archive_fmt = "{slug}_{num}" + root = "https://zzup.com" + pattern = (r"(?:https?://)?(?:www\.)?zzup\.com(/content" + r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html") + example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html" + + def __init__(self, match): + url = "{}/{}/index.html".format(self.root, match.group(1)) + GalleryExtractor.__init__(self, match, url) + self.slug = match.group(2) + + def metadata(self, page): + return { + "slug" : self.slug, + "title": text.unescape(text.extr( + page, "", ""))[:-11], + } + + def images(self, page): + path = text.extr(page, 'class="picbox">05}" + p2[4:] + return [(ufmt.format(num), None) for num in range(1, count + 1)] diff --git a/test/results/zzup.py b/test/results/zzup.py new file mode 100644 index 00000000..ad68e41c --- /dev/null +++ b/test/results/zzup.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import zzup + + +__tests__ = ( +{ + "#url" : "https://zzup.com/content/NjM=/MetArt_20080206_viki_c_sensazioni_by_ingret/OTE=/index.html", + "#category": ("", "zzup", "gallery"), + "#class" : zzup.ZzupGalleryExtractor, + "#pattern" : r"https://zzup\.com/MjAxNjc3OTIyMjE5Nzk=/showimage/zzup-8769086487/image00\d\d\d-5896498214-1-9689595623/MetArt-20080206_viki_c_sensazioni_by_ingret/9879560327/zzup.com.jpg", + + "slug" : "MetArt_20080206_viki_c_sensazioni_by_ingret", + "title" : "MetArt 20080206 viki c sensazioni by ingret", + "num" : int, + "count" : 135, +}, + +{ + "#url" : "https://zzup.com/content/MTc2MDYxMw==/Courtesan/NDA=/page-1.html", + "#category": ("", "zzup", "gallery"), + "#class" : zzup.ZzupGalleryExtractor, + "#pattern" : r"https://zzup.com/MjAxNjc3OTIyMjE5Nzk=/showimage/zzup-8769086487/image000\d\d-5896498214-40-9689595623/Courtesan/9879560327/zzup.com.jpg", +}, + +) + From 0ab0a10d2dff8a32177f45c23f48c58e5493b725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 02:26:22 +0100 Subject: [PATCH 28/77] [jpgfish] update domain --- gallery_dl/extractor/chevereto.py | 2 +- test/results/jpgfish.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index 21166bdb..2bf200b0 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -35,7 +35,7 @@ class CheveretoExtractor(BaseExtractor): BASE_PATTERN = CheveretoExtractor.update({ "jpgfish": { - "root": "https://jpg2.su", + "root": "https://jpg4.su", "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)", }, "pixl": { diff --git a/test/results/jpgfish.py b/test/results/jpgfish.py index bf35bf7a..354e2ff5 100644 --- a/test/results/jpgfish.py +++ b/test/results/jpgfish.py @@ -9,7 +9,7 @@ from gallery_dl.extractor import chevereto __tests__ = ( { - "#url" : "https://jpg2.su/img/funnymeme.LecXGS", + "#url" : "https://jpg4.su/img/funnymeme.LecXGS", "#category": ("chevereto", "jpgfish", "image"), 
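+    # note: only the page domain changed (jpg2.su -> jpg4.su); the sample
+    # file is still served from jpg.church, so the '#urls' value below
+    # stays the same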
"#class" : chevereto.CheveretoImageExtractor, "#urls" : "https://simp3.jpg.church/images/funnymeme.jpg", From b4bcf40278e79628a81f58d7640f41630a9c66b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 17:18:33 +0100 Subject: [PATCH 29/77] [weibo] fix AttributeError in 'user' extractor (#5022) yet another bug caused by a383eca7 --- gallery_dl/extractor/weibo.py | 3 --- test/results/weibo.py | 47 +++++++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 7413b5a0..3bd06489 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -225,9 +225,6 @@ class WeiboUserExtractor(WeiboExtractor): pattern = USER_PATTERN + r"(?:$|#)" example = "https://weibo.com/USER" - def initialize(self): - pass - def items(self): base = "{}/u/{}?tabtype=".format(self.root, self._user_id()) return self._dispatch_extractors(( diff --git a/test/results/weibo.py b/test/results/weibo.py index 639994c0..68e27f8f 100644 --- a/test/results/weibo.py +++ b/test/results/weibo.py @@ -13,7 +13,35 @@ __tests__ = ( "#url" : "https://weibo.com/1758989602", "#category": ("", "weibo", "user"), "#class" : weibo.WeiboUserExtractor, - "#pattern" : r"^https://weibo\.com/u/1758989602\?tabtype=feed$", + "#urls" : "https://weibo.com/u/1758989602?tabtype=feed", +}, + +{ + "#url" : "https://weibo.com/1758989602", + "#category": ("", "weibo", "user"), + "#class" : weibo.WeiboUserExtractor, + "#options" : {"include": "all"}, + "#urls" : ( + "https://weibo.com/u/1758989602?tabtype=home", + "https://weibo.com/u/1758989602?tabtype=feed", + "https://weibo.com/u/1758989602?tabtype=video", + "https://weibo.com/u/1758989602?tabtype=newVideo", + "https://weibo.com/u/1758989602?tabtype=album", + ), +}, + +{ + "#url" : "https://weibo.com/zhouyuxi77", + "#category": ("", "weibo", "user"), + "#class" : weibo.WeiboUserExtractor, + "#urls" : "https://weibo.com/u/7488709788?tabtype=feed", +}, + +{ + "#url" : "https://www.weibo.com/n/周于希Sally", + "#category": ("", "weibo", "user"), + "#class" : weibo.WeiboUserExtractor, + "#urls" : "https://weibo.com/u/7488709788?tabtype=feed", }, { @@ -69,9 +97,11 @@ __tests__ = ( "#class" : weibo.WeiboFeedExtractor, "#range" : "1", - "status": {"user": { - "id" : 7488709788, -}}, + "status": { + "user": { + "id": 7488709788, + }, + }, }, { @@ -80,9 +110,12 @@ __tests__ = ( "#class" : weibo.WeiboFeedExtractor, "#range" : "1", - "status": {"user": { - "id" : 7488709788, -}}, + + "status": { + "user": { + "id": 7488709788, + }, + }, }, { From e61f016465c0edd71725c11dadfb66da57decce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 17:56:39 +0100 Subject: [PATCH 30/77] [szurubooru] support 'snootbooru.com' (#5023) --- docs/supportedsites.md | 8 ++- gallery_dl/extractor/szurubooru.py | 4 ++ test/results/snootbooru.py | 79 ++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 test/results/snootbooru.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dbdaac24..c0fee2a8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1037,7 +1037,7 @@ Consider all listed sites to potentially be NSFW. JPG Fish - https://jpg2.su/ + https://jpg4.su/ Albums, individual Images, User Profiles @@ -1409,6 +1409,12 @@ Consider all listed sites to potentially be NSFW. 
Posts, Tag Searches + + Snootbooru + https://snootbooru.com/ + Posts, Tag Searches + + URL Shorteners diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py index 5415bf30..08cccab6 100644 --- a/gallery_dl/extractor/szurubooru.py +++ b/gallery_dl/extractor/szurubooru.py @@ -87,6 +87,10 @@ BASE_PATTERN = SzurubooruExtractor.update({ "root": "https://booru.bcbnsfw.space", "pattern": r"booru\.bcbnsfw\.space", }, + "snootbooru": { + "root": "https://snootbooru.com", + "pattern": r"snootbooru\.com", + }, }) diff --git a/test/results/snootbooru.py b/test/results/snootbooru.py new file mode 100644 index 00000000..822bad6e --- /dev/null +++ b/test/results/snootbooru.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import szurubooru + + +__tests__ = ( +{ + "#url" : "https://snootbooru.com/posts/query=sport", + "#category": ("szurubooru", "snootbooru", "tag"), + "#class" : szurubooru.SzurubooruTagExtractor, + "#pattern" : r"https://snootbooru\.com/data/posts/\d+_[0-9a-f]{16}\.\w+", + "#count" : range(35, 50), +}, + +{ + "#url" : "https://snootbooru.com/post/14511", + "#category": ("szurubooru", "snootbooru", "post"), + "#class" : szurubooru.SzurubooruPostExtractor, + "#urls" : "https://snootbooru.com/data/posts/14511_e753313112755da6.png", + "#sha1_content": "e69e61e61c5372514808480aae3a8e355c9cd6fb", + + "canvasHeight" : 1000, + "canvasWidth" : 1414, + "checksum" : "e69e61e61c5372514808480aae3a8e355c9cd6fb", + "checksumMD5" : "f4f4ddfcbdf367f466ede0980acb3d7d", + "commentCount" : int, + "comments" : list, + "contentUrl" : "data/posts/14511_e753313112755da6.png", + "creationTime" : "2023-12-02T01:11:01.433664Z", + "date" : "dt:2023-12-02 01:11:01", + "extension" : "png", + "favoriteCount": int, + "favoritedBy" : list, + "featureCount" : int, + "fileSize" : 270639, + "filename" : "14511_e753313112755da6", + "flags" : [], + "hasCustomThumbnail": False, + "id" : 14511, + "lastEditTime" : "2023-12-02T01:12:09.500217Z", + "lastFeatureTime": None, + "mimeType" : "image/png", + "noteCount" : 0, + "notes" : [], + "ownFavorite" : False, + "ownScore" : 0, + "pools" : [], + "relationCount": 0, + "relations" : [], + "safety" : "safe", + "score" : 0, + "source" : None, + "tagCount" : 3, + "tags" : [ + "transparent", + "sport", + "text", + ], + "tags_default" : [ + "sport", + "text" + ], + "tags_type" : [ + "transparent" + ], + "thumbnailUrl" : "data/generated-thumbnails/14511_e753313112755da6.jpg", + "type" : "image", + "user" : { + "avatarUrl": "data/avatars/komp.png", + "name": "komp" + }, + "version" : 2, +}, + +) From 217fa7f8a1d42c53730807adfcbf9e4b730902d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 18:16:33 +0100 Subject: [PATCH 31/77] include 'test/results' in flake8 checks --- setup.cfg | 3 +- test/results/__init__.py | 1 - test/results/blogspot.py | 1 - test/results/nitter1d4us.py | 6 +- test/results/pillowfort.py | 124 ++++++++++++++++++------------------ test/results/unsplash.py | 2 +- 6 files changed, 67 insertions(+), 70 deletions(-) diff --git a/setup.cfg b/setup.cfg index e115e874..a5e01b66 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,8 @@ [flake8] -exclude = .git,__pycache__,build,dist,archive,results +exclude = .git,__pycache__,build,dist,archive ignore = E203,E226,W504 per-file-ignores = setup.py: E501 
gallery_dl/extractor/500px.py: E501 gallery_dl/extractor/mangapark.py: E501 + test/results/*.py: E122,E241,E402,E501 diff --git a/test/results/__init__.py b/test/results/__init__.py index c54bea56..0865693b 100644 --- a/test/results/__init__.py +++ b/test/results/__init__.py @@ -5,7 +5,6 @@ # published by the Free Software Foundation. import os -import sys import functools __directory__ = os.path.dirname(__file__) diff --git a/test/results/blogspot.py b/test/results/blogspot.py index 83f4e5f7..75ecff92 100644 --- a/test/results/blogspot.py +++ b/test/results/blogspot.py @@ -43,7 +43,6 @@ __tests__ = ( "extension": "jpg", "filename" : "Icy-Moonrise---For-Web", "num" : 1, - "num" : int, "url" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", }, diff --git a/test/results/nitter1d4us.py b/test/results/nitter1d4us.py index 4c6c3d12..b816b44f 100644 --- a/test/results/nitter1d4us.py +++ b/test/results/nitter1d4us.py @@ -41,10 +41,8 @@ __tests__ = ( "#category": ("nitter", "nitter.1d4.us", "tweet"), "#class" : nitter.NitterTweetExtractor, - "content": r"""re:Gear up for #PokemonSwordShieldEX with special Mystery Gifts! - -You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, plus some very useful items. It’s our \(Mystery\) Gift to you, Trainers! - + "content": r"""re:Gear up for #PokemonSwordShieldEX with special Mystery Gifts! \n +You’ll be able to receive four Galarian form Pokémon with Hidden Abilities, plus some very useful items. It’s our \(Mystery\) Gift to you, Trainers! \n ❓🎁➡️ """, }, diff --git a/test/results/pillowfort.py b/test/results/pillowfort.py index b04be6f3..0d260b91 100644 --- a/test/results/pillowfort.py +++ b/test/results/pillowfort.py @@ -71,58 +71,58 @@ __tests__ = ( "#pattern" : r"https://img2\.pillowfort\.social/posts/c8e834bc09e6_Brandee\.png", "#count" : 1, - "avatar_frame" : None, - "avatar_id" : None, - "avatar_url" : "https://img3.pillowfort.social/avatars/000/037/139/original/437.jpg?1545015697", - "b2_lg_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee.png", - "b2_sm_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee_small.png", - "cached_tag_list": "art, digital art, mermaid, mermaids, underwater, seaweed, illustration, speed paint", - "col" : 0, - "comm_screening_status": "not_applicable", - "commentable" : True, - "comments_count": 0, - "community_id" : None, - "concealed_comment_warning": None, - "content" : "
Sea Bed
", - "created_at" : r"re:2020-02-.+", - "currentuser_default_avatar_url": None, - "currentuser_multi_avi": None, - "date" : "dt:2020-02-29 17:09:03", - "deleted" : None, - "deleted_at" : None, - "deleted_by_mod": None, - "deleted_for_flag_id": None, - "embed_code" : None, - "extension" : "png", - "filename" : "Brandee", - "hash" : "c8e834bc09e6", - "id" : 720167, - "last_activity" : r"re:2020-02-.+", - "last_activity_elapsed": r"re:\d+ months", - "last_edited_at": None, - "likes_count" : 8, - "media_type" : "picture", - "nsfw" : False, - "num" : 1, - "original_post_id": None, - "original_post_user_id": None, - "pic_row_last" : 1, - "picture_content_type": None, - "picture_file_name": None, - "picture_file_size": None, - "picture_updated_at": None, - "post_id" : 1124584, - "post_type" : "picture", - "privacy" : "public", - "reblog_copy_info": [], - "rebloggable" : True, - "reblogged_from_post_id": None, - "reblogged_from_user_id": None, - "reblogs_count" : int, - "row" : 1, - "small_image_url": None, - "tag_list" : None, - "tags" : [ + "avatar_frame" : None, + "avatar_id" : None, + "avatar_url" : "https://img3.pillowfort.social/avatars/000/037/139/original/437.jpg?1545015697", + "b2_lg_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee.png", + "b2_sm_url" : "https://img2.pillowfort.social/posts/c8e834bc09e6_Brandee_small.png", + "cached_tag_list": "art, digital art, mermaid, mermaids, underwater, seaweed, illustration, speed paint", + "col" : 0, + "comm_screening_status": "not_applicable", + "commentable" : True, + "comments_count": 0, + "community_id" : None, + "concealed_comment_warning": None, + "content" : "
Sea Bed
", + "created_at" : r"re:2020-02-.+", + "currentuser_default_avatar_url": None, + "currentuser_multi_avi": None, + "date" : "dt:2020-02-29 17:09:03", + "deleted" : None, + "deleted_at" : None, + "deleted_by_mod": None, + "deleted_for_flag_id": None, + "embed_code" : None, + "extension" : "png", + "filename" : "Brandee", + "hash" : "c8e834bc09e6", + "id" : 720167, + "last_activity" : r"re:2020-02-.+", + "last_activity_elapsed": r"re:\d+ months", + "last_edited_at": None, + "likes_count" : 8, + "media_type" : "picture", + "nsfw" : False, + "num" : 1, + "original_post_id": None, + "original_post_user_id": None, + "pic_row_last" : 1, + "picture_content_type": None, + "picture_file_name": None, + "picture_file_size": None, + "picture_updated_at": None, + "post_id" : 1124584, + "post_type" : "picture", + "privacy" : "public", + "reblog_copy_info": [], + "rebloggable" : True, + "reblogged_from_post_id": None, + "reblogged_from_user_id": None, + "reblogs_count" : int, + "row" : 1, + "small_image_url": None, + "tag_list" : None, + "tags" : [ "art", "digital art", "mermaid", @@ -130,16 +130,16 @@ __tests__ = ( "underwater", "seaweed", "illustration", - "speed paint" - ], - "time_elapsed" : r"re:\d+ months", - "timestamp" : str, - "title" : "", - "updated_at" : r"re:2020-02-.+", - "url" : "", - "user_concealed": None, - "user_id" : 37201, - "username" : "Maclanahan", + "speed paint", + ], + "time_elapsed" : r"re:\d+ months", + "timestamp" : str, + "title" : "", + "updated_at" : r"re:2020-02-.+", + "url" : "", + "user_concealed": None, + "user_id" : 37201, + "username" : "Maclanahan", }, { diff --git a/test/results/unsplash.py b/test/results/unsplash.py index e3413aff..01692eec 100644 --- a/test/results/unsplash.py +++ b/test/results/unsplash.py @@ -81,7 +81,7 @@ __tests__ = ( "full" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=srgb&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=85", "raw" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3", "regular" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=1080", - "small" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=400", + "small" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=400", "small_s3": "https://s3.us-west-2.amazonaws.com/images.unsplash.com/small/photo-1601823984263-b87b59798b70", "thumb" : "https://images.unsplash.com/photo-1601823984263-b87b59798b70?crop=entropy&cs=tinysrgb&fit=max&fm=jpg&ixid=M3wxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNzAwODY2NDE4fA&ixlib=rb-4.0.3&q=80&w=200", }, From 11150a7d72a68647b5960702e984b15d784b061f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 5 Jan 2024 21:32:04 +0100 Subject: [PATCH 32/77] [nudecollect] remove module --- docs/supportedsites.md | 12 ++-- gallery_dl/extractor/__init__.py | 1 - gallery_dl/extractor/nudecollect.py | 87 ----------------------------- test/results/nudecollect.py | 56 ------------------- test/results/zzup.py | 1 - 5 files changed, 6 insertions(+), 151 deletions(-) delete mode 100644 gallery_dl/extractor/nudecollect.py delete mode 100644 
test/results/nudecollect.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c0fee2a8..23459d0c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -589,12 +589,6 @@ Consider all listed sites to potentially be NSFW. Albums - - Nudecollect - https://nudecollect.com/ - Albums, individual Images - - Patreon https://www.patreon.com/ @@ -1003,6 +997,12 @@ Consider all listed sites to potentially be NSFW. individual Images, Tag Searches Supported + + Zzup + https://zzup.com/ + Galleries + + かべうち https://kabe-uchiroom.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8d974ecc..6fca0120 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -107,7 +107,6 @@ modules = [ "nitter", "nozomi", "nsfwalbum", - "nudecollect", "paheal", "patreon", "philomena", diff --git a/gallery_dl/extractor/nudecollect.py b/gallery_dl/extractor/nudecollect.py deleted file mode 100644 index bda5d774..00000000 --- a/gallery_dl/extractor/nudecollect.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://nudecollect.com/""" - -from .common import GalleryExtractor -from .. import text - - -class NudecollectExtractor(GalleryExtractor): - """Base class for Nudecollect extractors""" - category = "nudecollect" - directory_fmt = ("{category}", "{title}") - filename_fmt = "{slug}_{num:>03}.{extension}" - archive_fmt = "{slug}_{num}" - root = "https://www.nudecollect.com" - - def request(self, url, **kwargs): - kwargs["allow_redirects"] = False - return GalleryExtractor.request(self, url, **kwargs) - - @staticmethod - def get_title(page): - return text.unescape(text.extr(page, "", ""))[31:] - - @staticmethod - def get_image(page): - return text.extr(page, '05}" + p2[4:] - return [(ufmt.format(num), None) for num in range(1, self.count + 1)] diff --git a/test/results/nudecollect.py b/test/results/nudecollect.py deleted file mode 100644 index 423c915f..00000000 --- a/test/results/nudecollect.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -from gallery_dl.extractor import nudecollect - - -__tests__ = ( -{ - "#url" : "https://www.nudecollect.com/content/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/image-4-pics-108-mirror-43.html", - "#category": ("", "nudecollect", "image"), - "#class" : nudecollect.NudecollectImageExtractor, - "#pattern" : r"https://mirror\d+\.nudecollect\.com/showimage/nudecollect-8769086487/image00004-5896498214-43-9689595623/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/9879560327/nudecollect\.com\.jpg", - - "slug" : "20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust", - "title" : "20201220 Teenpornstorage Patritcy Vanessa Lesbian Lust", - "num" : 4, - "count" : 108, - "mirror": 43, -}, - -{ - "#url" : "https://www.nudecollect.com/content/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/image-10-pics-108-mirror-43.html", - "#category": ("", "nudecollect", "image"), - "#class" : nudecollect.NudecollectImageExtractor, -}, - -{ - "#url" : "https://www.nudecollect.com/content/20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex_with_alluring_Czech_babes_x125_1080px/index-mirror-67-125.html", - "#category": ("", "nudecollect", "album"), - "#class" : nudecollect.NudecollectAlbumExtractor, - "#pattern" : r"https://mirror\d+\.nudecollect\.com/showimage/nudecollect-8769086487/image00\d\d\d-5896498214-67-9689595623/20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex_with_alluring_Czech_babes_x125_1080px/9879560327/nudecollect\.com\.jpg", - "#count" : 125, - - "slug" : "20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex_with_alluring_Czech_babes_x125_1080px", - "title" : "20170219 TheWhiteBoxxx Caprice Tracy Loves Hot ass fingering and sensual lesbian sex with alluring Czech babes x125 1080px", - "num" : int, - "mirror": 67, -}, - -{ - "#url" : "https://www.nudecollect.com/content/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/page-1-pics-108-mirror-43.html", - "#category": ("", "nudecollect", "album"), - "#class" : nudecollect.NudecollectAlbumExtractor, - "#pattern" : r"https://mirror\d+\.nudecollect\.com/showimage/nudecollect-8769086487/image00\d\d\d-5896498214-43-9689595623/20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust/9879560327/nudecollect\.com\.jpg", - "#count" : 108, - - "slug" : "20201220_Teenpornstorage_Patritcy_Vanessa_Lesbian_Lust", - "title" : "20201220 Teenpornstorage Patritcy Vanessa Lesbian Lust", - "num" : int, - "mirror": 43, -}, - -) diff --git a/test/results/zzup.py b/test/results/zzup.py index ad68e41c..87b9bada 100644 --- a/test/results/zzup.py +++ b/test/results/zzup.py @@ -28,4 +28,3 @@ __tests__ = ( }, ) - From 3aa24c3744474a4fe06ebdec946a895c4f9d538c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Jan 2024 00:51:52 +0100 Subject: [PATCH 33/77] [bato] simplify and update --- docs/supportedsites.md | 2 +- gallery_dl/extractor/bato.py | 141 ++++++++++++++++------------------- test/results/bato.py | 2 +- 3 files changed, 66 insertions(+), 79 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6040cd47..c1acadd2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -99,7 +99,7 @@ Consider all listed sites to potentially be NSFW. 
Bato - https://bato.to + https://bato.to/ Chapters, Manga diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/bato.py index d29a58bf..83404a75 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/bato.py @@ -4,61 +4,63 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bato.to and aliases (v3x only)""" +"""Extractors for https://bato.to/""" -from .common import ChapterExtractor, MangaExtractor +from .common import Extractor, ChapterExtractor, MangaExtractor from .. import text, exception import re -BASE_PATTERN = r"(?:https?://)?" \ - r"(?:bato\.to|dto\.to|batotoo\.com|wto\.to|mangatoto\.com)" -MANGA_PATTERN = r"/title/\d+(?:-[0-9a-z]+)*/?" -CHAPTER_PATTERN = r"/\d+(?:-vol_\d+)?-ch_\d+\.?\d*/?" +BASE_PATTERN = (r"(?:https?://)?" + r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)") class BatoBase(): - """Base class for bato v3x extractors""" + """Base class for bato extractors""" category = "bato" root = "https://bato.to" + def request(self, url, **kwargs): + kwargs["encoding"] = "utf-8" + return Extractor.request(self, url, **kwargs) + class BatoChapterExtractor(BatoBase, ChapterExtractor): - """Extractor for manga chapters from bato.to""" - pattern = BASE_PATTERN + "(" + MANGA_PATTERN + CHAPTER_PATTERN + ")" - # There are three possible patterns for a chapter - example = "https://bato.to/title/12345-manga-name-with-spaces/54212-ch_1.5" - example2 = \ - "https://bato.to/title/12345-manga-name-with-spaces/54212-vol1-ch_1.5" - example3 = "https://bato.to/title/12345/54212" - # v2x, not supported - example4 = "https://bato.to/chapter/54212" + """Extractor for bato.to manga chapters""" + pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)" + example = "https://bato.to/title/12345-MANGA/54321" def __init__(self, match): - self.path = match.group(1) - ChapterExtractor.__init__(self, match, self.root + self.path) + self.root = text.root_from_url(match.group(0)) + self.chapter_id = match.group(1) + url = "{}/title/0/{}".format(self.root, self.chapter_id) + ChapterExtractor.__init__(self, match, url) def metadata(self, page): - info = text.extr( - page, "", r" - Read Free Manga Online at Bato.To" - ) - info = info.encode('latin-1').decode('utf-8').replace("\n", "") + extr = text.extract_from(page) + manga, info, _ = extr("", "<").rsplit(" - ", 3) + manga_id = extr("/title/", "/") match = re.match( - r"(.+) - " - r"(?:Volume *(\d+) )?" - r"Chapter *([\d\.]+)", info) - manga, volume, chapter = match.groups() if match else ("", "", info) - chapter, sep, minor = chapter.partition(".") - title_section = text.extr(page, '<a href="' + self.path + '"', "</a>") - title = text.extr(title_section, "<!-- -->", "</span>") + r"(?:Volume\s+(\d+) )?" 
+ r"\w+\s+(\d+)(.*)", info) + if match: + volume, chapter, minor = match.groups() + title = text.remove_html(extr( + "selected>", "</option")).partition(" : ")[2] + else: + volume = chapter = 0 + minor = "" + title = info return { "manga" : text.unescape(manga), + "manga_id" : text.parse_int(manga_id), "title" : text.unescape(title), - "author" : "", "volume" : text.parse_int(volume), "chapter" : text.parse_int(chapter), - "chapter_minor": sep + minor, + "chapter_minor": minor, + "chapter_id" : text.parse_int(self.chapter_id), + "date" : text.parse_timestamp(extr(' time="', '"')[:-3]), } def images(self, page): @@ -71,61 +73,46 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): class BatoMangaExtractor(BatoBase, MangaExtractor): - """Extractor for manga from bato.to""" + """Extractor for bato.to manga""" reverse = False chapterclass = BatoChapterExtractor - pattern = BASE_PATTERN + "(" + MANGA_PATTERN + "$" + ")" - # There are two possible patterns for a manga - example = "https://bato.to/title/12345-manga-name-with-spaces/" - example2 = "https://bato.to/title/12345/" - # v2x, not supported - example3 = "https://bato.to/series/12345/manga-name-with-space" + pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$" + example = "https://bato.to/title/12345-MANGA/" + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + self.manga_id = match.group(1) + url = "{}/title/{}".format(self.root, self.manga_id) + MangaExtractor.__init__(self, match, url) def chapters(self, page): - data = {} - num_chapters = text.extr(page, ">Chapters<", "</div>") - num_chapters = text.extr(num_chapters, r"<!-- -->", r"<!-- -->") - num_chapters = text.parse_int(num_chapters) - if num_chapters == 0: - raise exception.NotFoundError("chapter") + extr = text.extract_from(page) - manga = text.extr( - page, "<title>", r" - Read Free Manga Online at Bato.To" - ) - manga = manga.encode('latin-1').decode('utf-8').replace("\n", "") - data["manga"] = manga + warning = extr(' class="alert alert-warning">', "
<") + if warning: + raise exception.StopExtraction("'%s'", text.remove_html(warning)) + data = { + "manga_id": text.parse_int(self.manga_id), + "manga" : text.unescape(extr( + "", "<").rpartition(" - ")[0]), + } + + extr('<div data-hk="0-0-0-0"', "") results = [] - for chapter_num in range(num_chapters): - chapter_info = text.extr( - page, - '<div data-hk="0-0-{}-0"'.format(chapter_num), - r"</time><!--/-->" - ) - chapter_info += r"</time><!--/-->" # so we can match the date - url, pos = text.extract(chapter_info, '<a href="', '"') + while True: + href = extr('<a href="/title/', '"') + if not href: + break - chapter = re.search(r"-ch_([\d\.]+)", url) - if chapter: - chapt_major, sep, chapt_minor = chapter.group(1).partition(".") - title = text.extr( - chapter_info, - '<span data-hk="0-0-{}-1"'.format(chapter_num), - "</span>" - ) - title = text.extr(title, r"<!--#-->", r"<!--/-->") - if title is None or title == "" or title == "<!--/-->": - title, _ = text.extract(chapter_info, ">", "</a>", pos) + chapter = href.rpartition("-ch_")[2] + chapter, sep, minor = chapter.partition(".") - date = text.extr(chapter_info, "<time", "</time>") - date = text.extr(date, 'time="', '"') + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + data["date"] = text.parse_datetime( + extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ") - data["date"] = date - data["title"] = title - data["chapter"] = text.parse_int(chapt_major) - data["chapter_minor"] = sep + chapt_minor - - if url.startswith("/"): - url = self.root + url + url = "{}/title/{}".format(self.root, href) results.append((url, data.copy())) return results diff --git a/test/results/bato.py b/test/results/bato.py index 18479f9a..672362f5 100644 --- a/test/results/bato.py +++ b/test/results/bato.py @@ -60,6 +60,6 @@ __tests__ = ( "#url" : "https://bato.to/title/134270-removed", "#category": ("", "bato", "manga"), "#class" : bato.BatoMangaExtractor, - "#exception": exception.NotFoundError + "#exception": exception.StopExtraction, } ) From b11c352d66b6f23a9cb03047d4b19f7092bb4b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 01:49:34 +0100 Subject: [PATCH 34/77] [bato] rename to 'batoto' to use the same category name as the previous bato.to site --- docs/supportedsites.md | 2 +- gallery_dl/extractor/__init__.py | 2 +- gallery_dl/extractor/{bato.py => batoto.py} | 12 +++++----- scripts/supportedsites.py | 2 +- test/results/{bato.py => batoto.py} | 26 ++++++++++----------- 5 files changed, 22 insertions(+), 22 deletions(-) rename gallery_dl/extractor/{bato.py => batoto.py} (93%) rename test/results/{bato.py => batoto.py} (73%) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c1acadd2..9dc174a8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,7 +98,7 @@ Consider all listed sites to potentially be NSFW. 
<td></td> </tr> <tr> - <td>Bato</td> + <td>BATO.TO</td> <td>https://bato.to/</td> <td>Chapters, Manga</td> <td></td> diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 99de2169..4ab9db4d 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -24,7 +24,7 @@ modules = [ "architizer", "artstation", "aryion", - "bato", + "batoto", "bbc", "behance", "blogger", diff --git a/gallery_dl/extractor/bato.py b/gallery_dl/extractor/batoto.py similarity index 93% rename from gallery_dl/extractor/bato.py rename to gallery_dl/extractor/batoto.py index 83404a75..cd6302e6 100644 --- a/gallery_dl/extractor/bato.py +++ b/gallery_dl/extractor/batoto.py @@ -14,9 +14,9 @@ BASE_PATTERN = (r"(?:https?://)?" r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)") -class BatoBase(): - """Base class for bato extractors""" - category = "bato" +class BatotoBase(): + """Base class for batoto extractors""" + category = "batoto" root = "https://bato.to" def request(self, url, **kwargs): @@ -24,7 +24,7 @@ class BatoBase(): return Extractor.request(self, url, **kwargs) -class BatoChapterExtractor(BatoBase, ChapterExtractor): +class BatotoChapterExtractor(BatotoBase, ChapterExtractor): """Extractor for bato.to manga chapters""" pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)" example = "https://bato.to/title/12345-MANGA/54321" @@ -72,10 +72,10 @@ class BatoChapterExtractor(BatoBase, ChapterExtractor): ] -class BatoMangaExtractor(BatoBase, MangaExtractor): +class BatotoMangaExtractor(BatotoBase, MangaExtractor): """Extractor for bato.to manga""" reverse = False - chapterclass = BatoChapterExtractor + chapterclass = BatotoChapterExtractor pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$" example = "https://bato.to/title/12345-MANGA/" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index e3738b8b..ea6c2597 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -32,7 +32,7 @@ CATEGORY_MAP = { "atfbooru" : "ATFBooru", "b4k" : "arch.b4k.co", "baraag" : "baraag", - "bato" : "Bato", + "batoto" : "BATO.TO", "bbc" : "BBC", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", diff --git a/test/results/bato.py b/test/results/batoto.py similarity index 73% rename from test/results/bato.py rename to test/results/batoto.py index 672362f5..f3853247 100644 --- a/test/results/bato.py +++ b/test/results/batoto.py @@ -4,14 +4,14 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -from gallery_dl.extractor import bato +from gallery_dl.extractor import batoto from gallery_dl import exception __tests__ = ( { "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official/1681030-ch_8", - "#category": ("", "bato", "chapter"), - "#class" : bato.BatoChapterExtractor, + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, "#count" : 66, "manga" : "I Shall Master this Family! 
[Official]", @@ -21,8 +21,8 @@ __tests__ = ( { "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", "#comment" : "volume (vol) in url", - "#category": ("", "bato", "chapter"), - "#class" : bato.BatoChapterExtractor, + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, "#count" : 7, "manga" : "86--EIGHTY-SIX (Official)", @@ -32,8 +32,8 @@ __tests__ = ( }, { "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 21", "manga" : "Futsutsuka na Akujo de wa Gozaimasu ga - Suuguu Chouso Torikae Den (Official)", @@ -41,8 +41,8 @@ __tests__ = ( { "#url" : "https://bato.to/title/104929-86-eighty-six-official", "#comment" : "Manga with number in name", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 18", "manga" : "86--EIGHTY-SIX (Official)", @@ -50,16 +50,16 @@ __tests__ = ( { "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", "#comment" : "Non-English translation (Indonesian)", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 29", "manga" : "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", }, { "#url" : "https://bato.to/title/134270-removed", - "#category": ("", "bato", "manga"), - "#class" : bato.BatoMangaExtractor, + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, "#exception": exception.StopExtraction, } ) From 8e1a2b5446dd2b4e4933435da469ea2e76e04eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 02:16:43 +0100 Subject: [PATCH 35/77] [komikcast] update domain to 'komikcast.lol' (#5027) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/komikcast.py | 14 ++++++------- test/results/komikcast.py | 35 ++++++++++++++++++++++++++++--- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9aa51a08..d046aad4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -453,7 +453,7 @@ Consider all listed sites to potentially be NSFW. </tr> <tr> <td>Komikcast</td> - <td>https://komikcast.site/</td> + <td>https://komikcast.lol/</td> <td>Chapters, Manga</td> <td></td> </tr> diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index a3e01305..53411a2e 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -6,19 +6,19 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://komikcast.site/""" +"""Extractors for https://komikcast.lol/""" from .common import ChapterExtractor, MangaExtractor from .. 
import text import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)" class KomikcastBase(): """Base class for komikcast extractors""" category = "komikcast" - root = "https://komikcast.site" + root = "https://komikcast.lol" @staticmethod def parse_chapter_string(chapter_string, data=None): @@ -46,9 +46,9 @@ class KomikcastBase(): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): - """Extractor for manga-chapters from komikcast.site""" + """Extractor for manga-chapters from komikcast.lol""" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)" - example = "https://komikcast.site/chapter/TITLE/" + example = "https://komikcast.lol/chapter/TITLE/" def metadata(self, page): info = text.extr(page, "<title>", " - Komikcast<") @@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): - """Extractor for manga from komikcast.site""" + """Extractor for manga from komikcast.lol""" chapterclass = KomikcastChapterExtractor pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$" - example = "https://komikcast.site/komik/TITLE" + example = "https://komikcast.lol/komik/TITLE" def chapters(self, page): results = [] diff --git a/test/results/komikcast.py b/test/results/komikcast.py index 9a246009..89fcbf10 100644 --- a/test/results/komikcast.py +++ b/test/results/komikcast.py @@ -8,19 +8,48 @@ from gallery_dl.extractor import komikcast __tests__ = ( +{ + "#url" : "https://komikcast.lol/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", + "#category": ("", "komikcast", "chapter"), + "#class" : komikcast.KomikcastChapterExtractor, + "#pattern" : r"https://svr\d+\.imgkc\d+\.my\.id/wp-content/img/A/Apotheosis/002-2/\d{3}\.jpg", + "#count" : 18, + + "chapter" : 2, + "chapter_minor": ".2", + "count" : 18, + "extension": "jpg", + "filename" : r"re:0\d{2}", + "lang" : "id", + "language" : "Indonesian", + "manga" : "Apotheosis", + "page" : range(1, 18), + "title" : "", +}, + { "#url" : "https://komikcast.site/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", "#category": ("", "komikcast", "chapter"), "#class" : komikcast.KomikcastChapterExtractor, - "#sha1_url" : "f6b43fbc027697749b3ea1c14931c83f878d7936", - "#sha1_metadata": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", +}, + +{ + "#url" : "https://komikcast.me/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", + "#category": ("", "komikcast", "chapter"), + "#class" : komikcast.KomikcastChapterExtractor, +}, + +{ + "#url" : "https://komikcast.com/chapter/apotheosis-chapter-02-2-bahasa-indonesia/", + "#category": ("", "komikcast", "chapter"), + "#class" : komikcast.KomikcastChapterExtractor, }, { "#url" : "https://komikcast.me/chapter/soul-land-ii-chapter-300-1-bahasa-indonesia/", "#category": ("", "komikcast", "chapter"), "#class" : komikcast.KomikcastChapterExtractor, - "#sha1_url" : "efd00a9bd95461272d51990d7bc54b79ff3ff2e6", + "#sha1_url" : "f2674e31b41a7f009f2f292652be2aefb6612d3f", "#sha1_metadata": "cb646cfed3d45105bd645ab38b2e9f7d8c436436", }, From c25bdbae91f172112b5be7f1ea926ed07ac0c370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 14:19:44 +0100 Subject: [PATCH 36/77] [komikcast] fix 'manga' extractor (#5027) --- gallery_dl/extractor/komikcast.py | 6 ++++-- test/results/komikcast.py | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/komikcast.py 
b/gallery_dl/extractor/komikcast.py index 53411a2e..7a19be50 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -76,8 +76,10 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): for item in text.extract_iter( page, '<a class="chapter-link-item" href="', '</a'): - url, _, chapter_string = item.rpartition('">Chapter ') - self.parse_chapter_string(chapter_string, data) + url, _, chapter = item.rpartition('">Chapter') + chapter, sep, minor = chapter.strip().partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor results.append((url, data.copy())) return results diff --git a/test/results/komikcast.py b/test/results/komikcast.py index 89fcbf10..fa35c95f 100644 --- a/test/results/komikcast.py +++ b/test/results/komikcast.py @@ -57,8 +57,22 @@ __tests__ = ( "#url" : "https://komikcast.site/komik/090-eko-to-issho/", "#category": ("", "komikcast", "manga"), "#class" : komikcast.KomikcastMangaExtractor, - "#sha1_url" : "19d3d50d532e84be6280a3d61ff0fd0ca04dd6b4", - "#sha1_metadata": "837a7e96867344ff59d840771c04c20dc46c0ab1", + "#pattern" : komikcast.KomikcastChapterExtractor.pattern, + "#count" : 12, + + "author" : "Asakura Maru", + "chapter": range(1, 12), + "chapter_minor": "", + "genres" : [ + "Comedy", + "Drama", + "Romance", + "School Life", + "Sci-Fi", + "Shounen" + ], + "manga" : "090 Eko to Issho", + "type" : "Manga", }, { From cbfb7bfdf175d29beb655c2e96107956c2df346b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 14:26:46 +0100 Subject: [PATCH 37/77] [gelbooru] display error for invalid API responses (#4903) --- gallery_dl/extractor/gelbooru.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index d9da7bc3..eba15390 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -32,10 +32,13 @@ class GelbooruBase(): url = self.root + "/index.php?page=dapi&q=index&json=1" data = self.request(url, params=params).json() - if key not in data: - return () + try: + posts = data[key] + except KeyError: + self.log.error("Incomplete API response (missing '%s')", key) + self.log.debug("%s", data) + return [] - posts = data[key] if not isinstance(posts, list): return (posts,) return posts From d0d199414f2eb77c5e19fa103740dbbccf015568 Mon Sep 17 00:00:00 2001 From: Se AKi <seaki@sastudio.jp> Date: Sat, 6 Jan 2024 23:15:15 +0900 Subject: [PATCH 38/77] modify useragent of pixiv --- gallery_dl/extractor/pixiv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 4414c71c..b9821f23 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -826,9 +826,9 @@ class PixivAppAPI(): extractor.session.headers.update({ "App-OS" : "ios", - "App-OS-Version": "13.1.2", - "App-Version" : "7.7.6", - "User-Agent" : "PixivIOSApp/7.7.6 (iOS 13.1.2; iPhone11,8)", + "App-OS-Version": "16.7.2", + "App-Version" : "7.19.1", + "User-Agent" : "PixivIOSApp/7.19.1 (iOS 16.7.2; iPhone12,8)", "Referer" : "https://app-api.pixiv.net/", }) From 6e10260fb071ce7625e79c0e59d8c004c29b501e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sat, 6 Jan 2024 17:48:58 +0100 Subject: [PATCH 39/77] release version 1.26.6 --- CHANGELOG.md | 32 ++++++++++++++++++++++++++++++++ README.rst | 4 ++-- gallery_dl/version.py | 2 +- 3 
files changed, 35 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8907e07b..7b135b74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,37 @@ # Changelog +## 1.26.6 - 2024-01-06 +### Extractors +#### Additions +- [batoto] add `chapter` and `manga` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434), [#2111](https://github.com/mikf/gallery-dl/issues/2111), [#4979](https://github.com/mikf/gallery-dl/issues/4979)) +- [deviantart] add `avatar` and `background` extractors ([#4995](https://github.com/mikf/gallery-dl/issues/4995)) +- [poringa] add support ([#4675](https://github.com/mikf/gallery-dl/issues/4675), [#4962](https://github.com/mikf/gallery-dl/issues/4962)) +- [szurubooru] support `snootbooru.com` ([#5023](https://github.com/mikf/gallery-dl/issues/5023)) +- [zzup] add `gallery` extractor ([#4517](https://github.com/mikf/gallery-dl/issues/4517), [#4604](https://github.com/mikf/gallery-dl/issues/4604), [#4659](https://github.com/mikf/gallery-dl/issues/4659), [#4863](https://github.com/mikf/gallery-dl/issues/4863), [#5016](https://github.com/mikf/gallery-dl/issues/5016)) +#### Fixes +- [gelbooru] fix `favorite` extractor ([#4903](https://github.com/mikf/gallery-dl/issues/4903)) +- [idolcomplex] fix extraction & update URL patterns ([#5002](https://github.com/mikf/gallery-dl/issues/5002)) +- [imagechest] fix loading more than 10 images in a gallery ([#4469](https://github.com/mikf/gallery-dl/issues/4469)) +- [jpgfish] update domain +- [komikcast] fix `manga` extractor ([#5027](https://github.com/mikf/gallery-dl/issues/5027)) +- [komikcast] update domain ([#5027](https://github.com/mikf/gallery-dl/issues/5027)) +- [lynxchan] update `bbw-chan` domain ([#4970](https://github.com/mikf/gallery-dl/issues/4970)) +- [manganelo] fix extraction & recognize `.to` TLDs ([#5005](https://github.com/mikf/gallery-dl/issues/5005)) +- [paheal] restore `extension` metadata ([#4976](https://github.com/mikf/gallery-dl/issues/4976)) +- [rule34us] add fallback for `video-cdn1` videos ([#4985](https://github.com/mikf/gallery-dl/issues/4985)) +- [weibo] fix AttributeError in `user` extractor ([#5022](https://github.com/mikf/gallery-dl/issues/5022)) +#### Improvements +- [gelbooru] show error for invalid API responses ([#4903](https://github.com/mikf/gallery-dl/issues/4903)) +- [rule34] recognize URLs with `www` subdomain ([#4984](https://github.com/mikf/gallery-dl/issues/4984)) +- [twitter] raise error for invalid `strategy` values ([#4953](https://github.com/mikf/gallery-dl/issues/4953)) +#### Metadata +- [fanbox] add `metadata` option ([#4921](https://github.com/mikf/gallery-dl/issues/4921)) +- [nijie] add `count` metadata ([#146](https://github.com/mikf/gallery-dl/issues/146)) +- [pinterest] add `count` metadata ([#4981](https://github.com/mikf/gallery-dl/issues/4981)) +### Miscellaneous +- fix and update zsh completion ([#4972](https://github.com/mikf/gallery-dl/issues/4972)) +- fix `--cookies-from-browser` macOS Firefox profile path + ## 1.26.5 - 2023-12-23 ### Extractors #### Additions diff --git a/README.rst b/README.rst index 782c8ded..ee165e52 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ 
Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.5/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.6/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d4ab3f64..15905d6b 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.6-dev" +__version__ = "1.26.6" From db8de135376d5c55ed685518024ed827161ebbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 7 Jan 2024 00:12:52 +0100 Subject: [PATCH 40/77] [vk] transform image URLs to non-blurred versions (#5017) apply the same filter from before d85e66bc --- gallery_dl/extractor/vk.py | 8 +++++++- gallery_dl/version.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index c22e67e6..95eeafe8 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text, exception +import re BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" @@ -24,6 +25,7 @@ class VkExtractor(Extractor): request_interval = (0.5, 1.5) def items(self): + sub = re.compile(r"/imp[fg]/").sub sizes = "wzyxrqpo" data = self.metadata() @@ -40,11 +42,15 @@ class VkExtractor(Extractor): continue try: - photo["url"] = photo[size + "src"] + url = photo[size + "src"] except KeyError: self.log.warning("no photo URL found (%s)", photo.get("id")) continue + photo["url"] = sub("/", url.partition("?")[0]) + # photo["url"] = url + photo["_fallback"] = (url,) + try: _, photo["width"], photo["height"] = photo[size] except ValueError: diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 15905d6b..d348b548 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.6" +__version__ = "1.26.7-dev" From 33f228756ace7efe282c924dbab4fb1c5801283a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 7 Jan 2024 02:59:35 +0100 Subject: [PATCH 41/77] [mangadex] add 'list' extractor (#5025) supports listing manga and chapters from list feed --- docs/supportedsites.md | 2 +- gallery_dl/extractor/mangadex.py | 31 +++++++++++++++++++++++++++++++ test/results/mangadex.py | 27 +++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d046aad4..057515c9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -502,7 +502,7 @@ Consider all listed sites to potentially be NSFW. 
<tr> <td>MangaDex</td> <td>https://mangadex.org/</td> - <td>Chapters, Followed Feed, Manga</td> + <td>Chapters, Followed Feed, Lists, Manga</td> <td>Supported</td> </tr> <tr> diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 94bea570..d287d5cf 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -148,6 +148,31 @@ class MangadexFeedExtractor(MangadexExtractor): return self.api.user_follows_manga_feed() +class MangadexListExtractor(MangadexExtractor): + """Extractor for mangadex lists""" + subcategory = "list" + pattern = (BASE_PATTERN + + r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?") + example = ("https://mangadex.org/list" + "/01234567-89ab-cdef-0123-456789abcdef/NAME") + + def __init__(self, match): + MangadexExtractor.__init__(self, match) + if match.group(2) != "feed": + self.subcategory = "list-feed" + self.items = self._items_titles + + def chapters(self): + return self.api.list_feed(self.uuid) + + def _items_titles(self): + data = {"_extractor": MangadexMangaExtractor} + for item in self.api.list(self.uuid)["relationships"]: + if item["type"] == "manga": + url = "{}/title/{}".format(self.root, item["id"]) + yield Message.Queue, url, data + + class MangadexAPI(): """Interface for the MangaDex API v5 @@ -173,6 +198,12 @@ class MangadexAPI(): params = {"includes[]": ("scanlation_group",)} return self._call("/chapter/" + uuid, params)["data"] + def list(self, uuid): + return self._call("/list/" + uuid)["data"] + + def list_feed(self, uuid): + return self._pagination("/list/" + uuid + "/feed") + @memcache(keyarg=1) def manga(self, uuid): params = {"includes[]": ("artist", "author")} diff --git a/test/results/mangadex.py b/test/results/mangadex.py index 17b2157c..ae1c7ab1 100644 --- a/test/results/mangadex.py +++ b/test/results/mangadex.py @@ -113,4 +113,31 @@ __tests__ = ( "#class" : mangadex.MangadexFeedExtractor, }, +{ + "#url" : "https://mangadex.org/list/3a0982c5-65aa-4de2-8a4a-2175be7383ab/test", + "#category": ("", "mangadex", "list"), + "#class" : mangadex.MangadexListExtractor, + "#urls" : ( + "https://mangadex.org/title/cba4e5d6-67a0-47a0-b37a-c06e9bf25d93", + "https://mangadex.org/title/cad76ec6-ca22-42f6-96f8-eca164da6545", + ), +}, + +{ + "#url" : "https://mangadex.org/list/3a0982c5-65aa-4de2-8a4a-2175be7383ab/test?tab=titles", + "#category": ("", "mangadex", "list"), + "#class" : mangadex.MangadexListExtractor, +}, + +{ + "#url" : "https://mangadex.org/list/3a0982c5-65aa-4de2-8a4a-2175be7383ab/test?tab=feed", + "#category": ("", "mangadex", "list-feed"), + "#class" : mangadex.MangadexListExtractor, + "#urls" : ( + "https://mangadex.org/chapter/c765d6d5-5712-4360-be0b-0c8e0914fc94", + "https://mangadex.org/chapter/fa8a695d-260f-4dcc-95a3-1f30e66d6571", + "https://mangadex.org/chapter/788766b9-41c6-422e-97ba-552f03ba9655", + ), +}, + ) From 657ed93a22a7eeb2e45c5a7787fb1e8069576951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 7 Jan 2024 22:23:30 +0100 Subject: [PATCH 42/77] [batoto] improve v2 manga URL pattern and add tests --- gallery_dl/extractor/batoto.py | 7 +-- test/results/batoto.py | 87 ++++++++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index cd6302e6..9cc6494a 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -11,7 +11,7 @@ from .. import text, exception import re BASE_PATTERN = (r"(?:https?://)?" 
- r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)") + r"(?:(?:ba|d|w)to\.to|(?:batotoo|mangatoto)\.com)") class BatotoBase(): @@ -76,12 +76,13 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor): """Extractor for bato.to manga""" reverse = False chapterclass = BatotoChapterExtractor - pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$" + pattern = (BASE_PATTERN + + r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$") example = "https://bato.to/title/12345-MANGA/" def __init__(self, match): self.root = text.root_from_url(match.group(0)) - self.manga_id = match.group(1) + self.manga_id = match.group(1) or match.group(2) url = "{}/title/{}".format(self.root, self.manga_id) MangaExtractor.__init__(self, match, url) diff --git a/test/results/batoto.py b/test/results/batoto.py index f3853247..d61f7c87 100644 --- a/test/results/batoto.py +++ b/test/results/batoto.py @@ -14,10 +14,21 @@ __tests__ = ( "#class" : batoto.BatotoChapterExtractor, "#count" : 66, - "manga" : "I Shall Master this Family! [Official]", - "title" : "Observing", "chapter" : 8, + "chapter_id" : 1681030, + "chapter_minor": "", + "count" : 66, + "date" : "dt:2021-05-15 18:51:37", + "extension" : "webp", + "filename" : str, + "manga" : "I Shall Master this Family! [Official]", + "manga_id" : 86408, + "page" : range(1, 66), + "title" : "Observing", + "volume" : 0, + }, + { "#url" : "https://bato.to/title/104929-86-eighty-six-official/1943513-vol_1-ch_5", "#comment" : "volume (vol) in url", @@ -25,19 +36,25 @@ __tests__ = ( "#class" : batoto.BatotoChapterExtractor, "#count" : 7, - "manga" : "86--EIGHTY-SIX (Official)", - "title" : "The Spearhead Squadron's Power", - "volume" : 1, - "chapter" : 5, + "manga" : "86--EIGHTY-SIX (Official)", + "title" : "The Spearhead Squadron's Power", + "volume" : 1, + "chapter": 5, }, + { "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", "#category": ("", "batoto", "manga"), "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 21", + "chapter" : int, + "chapter_minor": str, + "date" : "type:datetime", "manga" : "Futsutsuka na Akujo de wa Gozaimasu ga - Suuguu Chouso Torikae Den (Official)", + "manga_id" : 113742, }, + { "#url" : "https://bato.to/title/104929-86-eighty-six-official", "#comment" : "Manga with number in name", @@ -45,8 +62,9 @@ __tests__ = ( "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 18", - "manga" : "86--EIGHTY-SIX (Official)", + "manga": "86--EIGHTY-SIX (Official)", }, + { "#url" : "https://bato.to/title/140046-the-grand-duke-s-fox-princess-mgchan", "#comment" : "Non-English translation (Indonesian)", @@ -54,12 +72,63 @@ __tests__ = ( "#class" : batoto.BatotoMangaExtractor, "#count" : ">= 29", - "manga" : "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", + "manga": "The Grand Duke’s Fox Princess ⎝⎝MGCHAN⎠⎠", }, + { "#url" : "https://bato.to/title/134270-removed", + "#comment" : "Deleted/removed manga", "#category": ("", "batoto", "manga"), "#class" : batoto.BatotoMangaExtractor, "#exception": exception.StopExtraction, -} +}, + +{ + "#url" : "https://bato.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://bato.to/chapter/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://dto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : 
"https://wto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://batotoo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://mangatoto.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, +}, + +{ + "#url" : "https://bato.to/series/86408/i-shall-master-this-family-official", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, +}, + ) From 61f3b2f820f4687837e10fa9b067782807d49a4c Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 01:29:47 +1100 Subject: [PATCH 43/77] [hatenablog] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/hatenablog.py | 167 +++++++++++++++++++++++++++++ scripts/supportedsites.py | 7 +- test/results/hatenablog.py | 144 +++++++++++++++++++++++++ 5 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 gallery_dl/extractor/hatenablog.py create mode 100644 test/results/hatenablog.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d046aad4..188d8294 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW. <td>Folders</td> <td></td> </tr> +<tr> + <td>HatenaBlog</td> + <td>https://hatenablog.com</td> + <td>Archive, Individual Posts, Home Feed, Search Results</td> + <td></td> +</tr> <tr> <td>HBrowse</td> <td>https://www.hbrowse.com/</td> diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3..26ce2093 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -53,6 +53,7 @@ modules = [ "gelbooru_v01", "gelbooru_v02", "gofile", + "hatenablog", "hbrowse", "hentai2read", "hentaicosplays", diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py new file mode 100644 index 00000000..59e2f94e --- /dev/null +++ b/gallery_dl/extractor/hatenablog.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hatenablog.com""" + +import re +from .common import Extractor, Message +from .. import text + + +BASE_PATTERN = ( + r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" + r"([\w-]+\.(?:hatenablog\.com|hatenablog\.jp" + r"|hatenadiary\.com|hateblo\.jp)))" +) +QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" + + +class HatenaBlogExtractor(Extractor): + """Base class for HatenaBlog extractors""" + category = "hatenablog" + directory_fmt = ("{category}", "{domain}") + filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" + archive_fmt = "{filename}" + + def __init__(self, match): + Extractor.__init__(self, match) + + self.domain = match.group(1) or match.group(2) + self._find_img = re.compile(r'<img +(.+?) 
*/?>').finditer + self._is_image = re.compile( + r'(?: |^)class="hatena-fotolife"(?: |$)').search + self._find_img_src = re.compile(r'(?: |^)src="(.+?)"(?: |$)').search + + def _handle_article(self, article: str): + extr = text.extract_from(article) + date = text.parse_datetime(extr('<time datetime="', '"')) + entry_link = text.unescape(extr( + '<a href="', '" class="entry-title-link bookmark">')) + entry = entry_link.partition("/entry/")[2] + title = extr('', '</a>') + content = extr( + '<div class="entry-content hatenablog-entry">', '</div>') + + images = [] + for i in self._find_img(content): + attributes = i.group(1) + if not self._is_image(attributes): + continue + image = text.unescape(self._find_img_src(attributes).group(1)) + images.append(image) + + data = { + "domain": self.domain, + "date": date, + "entry": entry, + "title": title, + "count": len(images), + } + yield Message.Directory, data + for data["num"], url in enumerate(images, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + +class HatenaBlogEntriesExtractor(HatenaBlogExtractor): + """Base class for a list of entries""" + allowed_parameters = () + + def __init__(self, match): + HatenaBlogExtractor.__init__(self, match) + self.path = match.group(3) + self.query = {key: value for key, value in text.parse_query( + match.group(4)).items() if self._acceptable_query(key)} + self._find_pager_url = re.compile( + r'<span class="pager-next">\s*<a href="(.+?)"').search + + def items(self): + url = "https://" + self.domain + self.path + query = self.query + + while url: + page = self.request(url, params=query).text + + extr = text.extract_from(page) + attributes = extr('<body ', '>') + if "page-archive" in attributes: + yield from self._handle_partial_articles(extr) + else: + yield from self._handle_full_articles(extr) + + match = self._find_pager_url(page) + url = text.unescape(match.group(1)) if match else None + query = None + + def _handle_partial_articles(self, extr): + while True: + section = extr('<section class="archive-entry', '</section>') + if not section: + break + + url = "hatenablog:" + text.unescape(text.extr(section, + '<a class="entry-title-link" href="', '"')) + data = {"_extractor": HatenaBlogEntryExtractor} + yield Message.Queue, url, data + + def _handle_full_articles(self, extr): + while True: + attributes = extr('<article ', '>') + if not attributes: + break + if "no-entry" in attributes: + continue + + article = extr('', '</article>') + yield from self._handle_article(article) + + def _acceptable_query(self, key: str) -> bool: + return key == "page" or key in self.allowed_parameters + + +class HatenaBlogEntryExtractor(HatenaBlogExtractor): + """Extractor for a single entry URL""" + subcategory = "entry" + pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE + example = "https://BLOG.hatenablog.com/entry/PATH" + + def __init__(self, match): + HatenaBlogExtractor.__init__(self, match) + self.path = match.group(3) + + def items(self): + url = "https://" + self.domain + "/entry/" + self.path + page = self.request(url).text + + extr = text.extract_from(page) + while True: + attributes = extr('<article ', '>') + if "no-entry" in attributes: + continue + article = extr('', '</article>') + return self._handle_article(article) + + +class HatenaBlogHomeExtractor(HatenaBlogEntriesExtractor): + """Extractor for a blog's home page""" + subcategory = "home" + pattern = BASE_PATTERN + r"(/?)" + QUERY_RE + example = "https://BLOG.hatenablog.com" + + +class 
HatenaBlogArchiveExtractor(HatenaBlogEntriesExtractor): + """Extractor for a blog's archive page""" + subcategory = "archive" + pattern = BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + \ + r"|/category/[^?#]+)?)" + QUERY_RE + example = "https://BLOG.hatenablog.com/archive/2024" + + +class HatenaBlogSearchExtractor(HatenaBlogEntriesExtractor): + """Extractor for a blog's search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"(/search)" + QUERY_RE + example = "https://BLOG.hatenablog.com/search?q=QUERY" + allowed_parameters = ("q",) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 798a6830..d29001c7 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -50,6 +50,7 @@ CATEGORY_MAP = { "fanbox" : "pixivFANBOX", "fashionnova" : "Fashion Nova", "furaffinity" : "Fur Affinity", + "hatenablog" : "HatenaBlog", "hbrowse" : "HBrowse", "hentai2read" : "Hentai2Read", "hentaicosplays" : "Hentai Cosplay", @@ -102,7 +103,6 @@ CATEGORY_MAP = { "pornimagesxxx" : "Porn Image", "pornpics" : "PornPics.com", "pornreactor" : "PornReactor", - "postmill" : "Postmill", "readcomiconline": "Read Comic Online", "rbt" : "RebeccaBlackTech", "redgifs" : "RedGIFs", @@ -189,6 +189,11 @@ SUBCATEGORY_MAP = { "fapello": { "path": "Videos, Trending Posts, Popular Videos, Top Models", }, + "hatenablog": { + "archive": "Archive", + "entry" : "Individual Posts", + "home" : "Home Feed", + }, "hentaifoundry": { "story": "", }, diff --git a/test/results/hatenablog.py b/test/results/hatenablog.py new file mode 100644 index 00000000..8ca7876f --- /dev/null +++ b/test/results/hatenablog.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
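+#
+# Note on HatenaBlogEntriesExtractor above: only whitelisted query
+# parameters survive ("page" plus each subclass's allowed_parameters),
+# so a search URL keeps "q" and "page" but drops e.g. tracking
+# parameters before the first request. A rough sketch of that filter,
+# assuming the names from hatenablog.py:
+#
+#   query = {"q": "a", "page": "2", "utm_source": "x"}
+#   kept = {k: v for k, v in query.items()
+#           if k == "page" or k in ("q",)}  # allowed_parameters
+#   # kept == {"q": "a", "page": "2"}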
+ +from gallery_dl.extractor import hatenablog + + +__tests__ = ( +{ + "#url" : "https://cosmiclatte.hatenablog.com/entry/2020/05/28/003227", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, + "#count" : 20, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/entry/2023/12/31/083846", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/entry/20231227/1703685600", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/entry/2ndlife", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/entry/2023/12/22/133549", + "#category": ("", "hatenablog", "entry"), + "#class" : hatenablog.HatenaBlogEntryExtractor, +}, + +{ + "#url" : "https://cetriolo.hatenablog.com", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, + "#range" : "1-7", + "#count" : 7, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/", + "#category": ("", "hatenablog", "home"), + "#class" : hatenablog.HatenaBlogHomeExtractor, +}, + +{ + "#url" : ("https://8saki.hatenablog.com/archive/category/%E3%82%BB%E3" + "%83%AB%E3%83%95%E3%82%B8%E3%82%A7%E3%83%AB%E3%83%8D%E3%82" + "%A4%E3%83%AB"), + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#range" : "1-30", + "#count" : 30, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/archive/2023", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#count" : 13, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/archive/2023/01", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#count" : 5, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/archive", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#range" : "1-30", + "#count" : 30, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/archive/2024/01/01", + "#category": ("", "hatenablog", "archive"), + "#class" : hatenablog.HatenaBlogArchiveExtractor, +}, + +{ + "#url" : "hatenablog:https://blog.hyouhon.com/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, + "#range" : "1-30", + "#count" : 30, +}, + +{ + "#url" : "https://cosmiclatte.hatenablog.com/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +{ + "#url" : "https://moko0908.hatenablog.jp/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +{ + "#url" : "https://p-shirokuma.hatenadiary.com/search?q=a", + "#category": ("", "hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +{ + "#url" : "https://urakatahero.hateblo.jp/search?q=a", + "#category": ("", 
"hatenablog", "search"), + "#class" : hatenablog.HatenaBlogSearchExtractor, +}, + +) From be6949c55d994d4a62d783d20c3a9d92bc81a53a Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 01:36:52 +1100 Subject: [PATCH 44/77] [hatenablog] fix linting error --- gallery_dl/extractor/hatenablog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 59e2f94e..322f2ca5 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -101,8 +101,8 @@ class HatenaBlogEntriesExtractor(HatenaBlogExtractor): if not section: break - url = "hatenablog:" + text.unescape(text.extr(section, - '<a class="entry-title-link" href="', '"')) + url = "hatenablog:" + text.unescape(text.extr( + section, '<a class="entry-title-link" href="', '"')) data = {"_extractor": HatenaBlogEntryExtractor} yield Message.Queue, url, data From 2cfe788f936a532784e66e7906dfb54c7c678e1f Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 01:42:57 +1100 Subject: [PATCH 45/77] [hatenablog] fix extractor naming errors --- gallery_dl/extractor/hatenablog.py | 18 +++++++------- test/results/hatenablog.py | 40 +++++++++++++++--------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 322f2ca5..dd1e45a5 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -19,7 +19,7 @@ BASE_PATTERN = ( QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" -class HatenaBlogExtractor(Extractor): +class HatenablogExtractor(Extractor): """Base class for HatenaBlog extractors""" category = "hatenablog" directory_fmt = ("{category}", "{domain}") @@ -65,12 +65,12 @@ class HatenaBlogExtractor(Extractor): yield Message.Url, url, text.nameext_from_url(url, data) -class HatenaBlogEntriesExtractor(HatenaBlogExtractor): +class HatenablogEntriesExtractor(HatenablogExtractor): """Base class for a list of entries""" allowed_parameters = () def __init__(self, match): - HatenaBlogExtractor.__init__(self, match) + HatenablogExtractor.__init__(self, match) self.path = match.group(3) self.query = {key: value for key, value in text.parse_query( match.group(4)).items() if self._acceptable_query(key)} @@ -103,7 +103,7 @@ class HatenaBlogEntriesExtractor(HatenaBlogExtractor): url = "hatenablog:" + text.unescape(text.extr( section, '<a class="entry-title-link" href="', '"')) - data = {"_extractor": HatenaBlogEntryExtractor} + data = {"_extractor": HatenablogEntryExtractor} yield Message.Queue, url, data def _handle_full_articles(self, extr): @@ -121,14 +121,14 @@ class HatenaBlogEntriesExtractor(HatenaBlogExtractor): return key == "page" or key in self.allowed_parameters -class HatenaBlogEntryExtractor(HatenaBlogExtractor): +class HatenablogEntryExtractor(HatenablogExtractor): """Extractor for a single entry URL""" subcategory = "entry" pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE example = "https://BLOG.hatenablog.com/entry/PATH" def __init__(self, match): - HatenaBlogExtractor.__init__(self, match) + HatenablogExtractor.__init__(self, match) self.path = match.group(3) def items(self): @@ -144,14 +144,14 @@ class HatenaBlogEntryExtractor(HatenaBlogExtractor): return self._handle_article(article) -class HatenaBlogHomeExtractor(HatenaBlogEntriesExtractor): +class HatenablogHomeExtractor(HatenablogEntriesExtractor): """Extractor for a blog's home page""" subcategory = 
"home" pattern = BASE_PATTERN + r"(/?)" + QUERY_RE example = "https://BLOG.hatenablog.com" -class HatenaBlogArchiveExtractor(HatenaBlogEntriesExtractor): +class HatenablogArchiveExtractor(HatenablogEntriesExtractor): """Extractor for a blog's archive page""" subcategory = "archive" pattern = BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + \ @@ -159,7 +159,7 @@ class HatenaBlogArchiveExtractor(HatenaBlogEntriesExtractor): example = "https://BLOG.hatenablog.com/archive/2024" -class HatenaBlogSearchExtractor(HatenaBlogEntriesExtractor): +class HatenablogSearchExtractor(HatenablogEntriesExtractor): """Extractor for a blog's search results""" subcategory = "search" pattern = BASE_PATTERN + r"(/search)" + QUERY_RE diff --git a/test/results/hatenablog.py b/test/results/hatenablog.py index 8ca7876f..4a306f9a 100644 --- a/test/results/hatenablog.py +++ b/test/results/hatenablog.py @@ -11,38 +11,38 @@ __tests__ = ( { "#url" : "https://cosmiclatte.hatenablog.com/entry/2020/05/28/003227", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, "#count" : 20, }, { "#url" : "https://moko0908.hatenablog.jp/entry/2023/12/31/083846", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "https://p-shirokuma.hatenadiary.com/entry/20231227/1703685600", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "https://urakatahero.hateblo.jp/entry/2ndlife", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "hatenablog:https://blog.hyouhon.com/entry/2023/12/22/133549", "#category": ("", "hatenablog", "entry"), - "#class" : hatenablog.HatenaBlogEntryExtractor, + "#class" : hatenablog.HatenablogEntryExtractor, }, { "#url" : "https://cetriolo.hatenablog.com", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, "#range" : "1-7", "#count" : 7, }, @@ -50,25 +50,25 @@ __tests__ = ( { "#url" : "https://moko0908.hatenablog.jp/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { "#url" : "https://p-shirokuma.hatenadiary.com/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { "#url" : "https://urakatahero.hateblo.jp/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { "#url" : "hatenablog:https://blog.hyouhon.com/", "#category": ("", "hatenablog", "home"), - "#class" : hatenablog.HatenaBlogHomeExtractor, + "#class" : hatenablog.HatenablogHomeExtractor, }, { @@ -76,7 +76,7 @@ __tests__ = ( "%83%AB%E3%83%95%E3%82%B8%E3%82%A7%E3%83%AB%E3%83%8D%E3%82" "%A4%E3%83%AB"), "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#range" : "1-30", "#count" : 30, }, @@ -84,21 +84,21 @@ __tests__ = ( { "#url" : "https://moko0908.hatenablog.jp/archive/2023", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : 
hatenablog.HatenablogArchiveExtractor, "#count" : 13, }, { "#url" : "https://p-shirokuma.hatenadiary.com/archive/2023/01", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#count" : 5, }, { "#url" : "https://urakatahero.hateblo.jp/archive", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, "#range" : "1-30", "#count" : 30, }, @@ -106,13 +106,13 @@ __tests__ = ( { "#url" : "hatenablog:https://blog.hyouhon.com/archive/2024/01/01", "#category": ("", "hatenablog", "archive"), - "#class" : hatenablog.HatenaBlogArchiveExtractor, + "#class" : hatenablog.HatenablogArchiveExtractor, }, { "#url" : "hatenablog:https://blog.hyouhon.com/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, "#range" : "1-30", "#count" : 30, }, @@ -120,25 +120,25 @@ __tests__ = ( { "#url" : "https://cosmiclatte.hatenablog.com/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, { "#url" : "https://moko0908.hatenablog.jp/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, { "#url" : "https://p-shirokuma.hatenadiary.com/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, { "#url" : "https://urakatahero.hateblo.jp/search?q=a", "#category": ("", "hatenablog", "search"), - "#class" : hatenablog.HatenaBlogSearchExtractor, + "#class" : hatenablog.HatenablogSearchExtractor, }, ) From ec958a26bc0f6664a8bf54bbb24412fdb49ada3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 8 Jan 2024 19:18:12 +0100 Subject: [PATCH 46/77] [fuskator] make metadata extraction non-fatal (#5039) - prevent KeyErrors - prevent HTTP redirect - return file URLs as list --- gallery_dl/extractor/fuskator.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index 20afb5a4..beecbff2 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -22,7 +22,7 @@ class FuskatorGalleryExtractor(GalleryExtractor): def __init__(self, match): self.gallery_hash = match.group(1) - url = "{}/thumbs/{}/".format(self.root, self.gallery_hash) + url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash) GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -50,15 +50,16 @@ class FuskatorGalleryExtractor(GalleryExtractor): "gallery_id" : text.parse_int(gallery_id), "gallery_hash": self.gallery_hash, "title" : text.unescape(title[:-15]), - "views" : data["hits"], - "score" : data["rating"], - "tags" : data["tags"].split(","), - "count" : len(data["images"]), + "views" : data.get("hits"), + "score" : data.get("rating"), + "tags" : (data.get("tags") or "").split(","), } def images(self, page): - for image in self.data["images"]: - yield "https:" + image["imageUrl"], image + return [ + ("https:" + image["imageUrl"], image) + for image in self.data["images"] + ] class FuskatorSearchExtractor(Extractor): From 2ccb7d3bd3f071c6923ca6eb9baedd196665d769 Mon Sep 17 
00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 17:12:56 +1100 Subject: [PATCH 47/77] [steamgriddb] add support --- docs/configuration.rst | 170 ++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/steamgriddb.py | 210 ++++++++++++++++++++++++++++ test/results/steamgriddb.py | 124 ++++++++++++++++ 4 files changed, 505 insertions(+) create mode 100644 gallery_dl/extractor/steamgriddb.py create mode 100644 test/results/steamgriddb.py diff --git a/docs/configuration.rst b/docs/configuration.rst index 8a1752ee..cfd67b3d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3076,6 +3076,176 @@ Description Download video files. +extractor.steamgriddb.animated +------------------------------ +Type + ``bool`` +Default + ``true`` +Description + Include animated assets when downloading from a list of assets. + + +extractor.steamgriddb.epilepsy +------------------------------ +Type + ``bool`` +Default + ``true`` +Description + Include assets tagged with epilepsy when downloading from a list of assets. + + +extractor.steamgriddb.dimensions +-------------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"1024x512,512x512"`` + * ``["460x215", "920x430"]`` +Description + Only include assets that are in the specified dimensions. ``all`` can be + used to specify all dimensions. Valid values are: + + * Grids: ``460x215``, ``920x430``, ``600x900``, ``342x482``, ``660x930``, + ``512x512``, ``1024x1024`` + * Heroes: ``1920x620``, ``3840x1240``, ``1600x650`` + * Logos: N/A (will be ignored) + * Icons: ``8x8``, ``10x10``, ``14x14``, ``16x16``, ``20x20``, ``24x24``, + ``28x28``, ``32x32``, ``35x35``, ``40x40``, ``48x48``, ``54x54``, + ``56x56``, ``57x57``, ``60x60``, ``64x64``, ``72x72``, ``76x76``, + ``80x80``, ``90x90``, ``96x96``, ``100x100``, ``114x114``, ``120x120``, + ``128x128``, ``144x144``, ``150x150``, ``152x152``, ``160x160``, + ``180x180``, ``192x192``, ``194x194``, ``256x256``, ``310x310``, + ``512x512``, ``768x768``, ``1024x1024`` + + +extractor.steamgriddb.file-types +-------------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"png,jpg"`` + * ``["jpeg", "webp"]`` +Description + Only include assets that are in the specified file types. ``all`` can be + used to specifiy all file types. Valid values are: + + * Grids: ``png``, ``jpeg``, ``jpg``, ``webp`` + * Heroes: ``png``, ``jpeg``, ``jpg``, ``webp`` + * Logos: ``png``, ``webp`` + * Icons: ``png``, ``ico`` + + +extractor.steamgriddb.download-fake-png +--------------------------------------- +Type + ``bool`` +Default + ``true`` +Description + Download fake PNGs alongside the real file. + + +extractor.steamgriddb.humor +--------------------------- +Type + ``bool`` +Default + ``true`` +Description + Include assets tagged with humor when downloading from a list of assets. + + +extractor.steamgriddb.languages +------------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"all"`` +Examples + * ``"en,km"`` + * ``["fr", "it"]`` +Description + Only include assets that are in the specified languages. ``all`` can be + used to specifiy all languages. Valid values are `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639-1>`__ + language codes. + + +extractor.steamgriddb.nsfw +-------------------------- +Type + ``bool`` +Default + ``true`` +Description + Include assets tagged with adult content when downloading from a list of assets. 
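+
+A combined example of the filter options above, as they might appear in a
+gallery-dl configuration file (illustrative values only)::
+
+    {
+        "extractor": {
+            "steamgriddb": {
+                "animated"  : false,
+                "dimensions": "460x215,920x430",
+                "file-types": ["png", "webp"],
+                "languages" : "en",
+                "nsfw"      : false
+            }
+        }
+    }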
+ + +extractor.steamgriddb.sort +-------------------------- +Type + ``string`` +Default + ``score_desc`` +Description + Set the chosen sorting method when downloading from a list of assets. Can be one of: + + * ``score_desc`` (Highest Score (Beta)) + * ``score_asc`` (Lowest Score (Beta)) + * ``score_old_desc`` (Highest Score (Old)) + * ``score_old_asc`` (Lowest Score (Old)) + * ``age_desc`` (Newest First) + * ``age_asc`` (Oldest First) + + +extractor.steamgriddb.static +---------------------------- +Type + ``bool`` +Default + ``true`` +Description + Include static assets when downloading from a list of assets. + + +extractor.steamgriddb.styles +---------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``all`` +Examples + * ``white,black`` + * ``["no_logo", "white_logo"]`` +Description + Only include assets that are in the specified styles. ``all`` can be used + to specify all styles. Valid values are: + + * Grids: ``alternate``, ``blurred``, ``no_logo``, ``material``, ``white_logo`` + * Heroes: ``alternate``, ``blurred``, ``material`` + * Logos: ``official``, ``white``, ``black``, ``custom`` + * Icons: ``official``, ``custom`` + + +extractor.steamgriddb.untagged +------------------------------ +Type + ``bool`` +Default + ``true`` +Description + Include untagged assets when downloading from a list of assets. + + extractor.[szurubooru].username & .token ---------------------------------------- Type diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3..be3ca649 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -145,6 +145,7 @@ modules = [ "smugmug", "soundgasm", "speakerdeck", + "steamgriddb", "subscribestar", "szurubooru", "tapas", diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py new file mode 100644 index 00000000..516c422b --- /dev/null +++ b/gallery_dl/extractor/steamgriddb.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.steamgriddb.com""" + +from .common import Extractor, Message +from .. 
import text, exception + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com" +LANGUAGE_CODES = ( + "aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az", + "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce", + "ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee", + "el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr", + "fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr", + "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is", + "it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn", + "ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln", + "lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms", + "mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv", + "ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu", + "rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk", + "sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta", + "te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw", + "ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", + "yo", "za", "zh", "zu", +) +FILE_EXT_TO_MIME = { + "png": "image/png", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "webp": "image/webp", + "ico": "image/vnd.microsoft.icon", + "all": "all", +} + + +class SteamgriddbExtractor(Extractor): + """Base class for SteamGridDB""" + category = "steamgriddb" + directory_fmt = ("{category}", "{subcategory}", "{game[id]}") + filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}" + archive_fmt = "{filename}" + root = "https://www.steamgriddb.com" + + def _init(self): + self.cookies_update({ + "userprefs": "%7B%22adult%22%3Afalse%7D", + }) + + def items(self): + download_fake_png = self.config("download-fake-png", True) + + for asset in self.assets(): + urls = [asset["url"]] + if download_fake_png and asset.get("fake_png"): + urls.append(asset["fake_png"]) + + asset["count"] = len(urls) + yield Message.Directory, asset + for asset["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, asset) + + def _call(self, endpoint: str, **kwargs): + data = self.request(self.root + endpoint, **kwargs).json() + if not data["success"]: + raise exception.StopExtraction(data["error"]) + return data["data"] + + +class SteamgriddbAssetsExtractor(SteamgriddbExtractor): + """Base class for extracting a list of assets""" + + def __init__(self, match): + SteamgriddbExtractor.__init__(self, match) + list_type = match.group(1) + id = int(match.group(2)) + self.game_id = id if list_type == "game" else None + self.collection_id = id if list_type == "collection" else None + self.page = int(match.group(3) or 1) + + def assets(self): + limit = 48 + page = min(self.page - 1, 0) + + sort = self.config("sort", "score_desc") + if sort not in ("score_desc", "score_asc", "score_old_desc", + "score_old_asc", "age_desc", "age_asc"): + raise exception.StopExtractor("Invalid sort '%s'", sort) + + json = { + "static" : self.config("static", True), + "animated": self.config("animated", True), + "humor" : self.config("humor", True), + "nsfw" : self.config("nsfw", True), + "epilepsy": self.config("epilepsy", True), + "untagged": self.config("untagged", True), + + "asset_type": self.asset_type, + "limit": limit, + "order": sort, + } + if self.valid_dimensions: + json["dimensions"] = self.config_list( + "dimensions", "dimension", self.valid_dimensions) + 
json["styles"] = self.config_list("styles", "style", self.valid_styles) + json["languages"] = self.config_list( + "languages", "language", LANGUAGE_CODES) + file_types = self.config_list( + "file-types", "file type", self.valid_file_types) + json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types] + + if self.game_id: + json["game_id"] = [self.game_id] + else: + json["collection_id"] = self.collection_id + + while True: + json["page"] = page + + data = self._call( + "/api/public/search/assets", method="POST", json=json) + for asset in data["assets"]: + if not asset.get("game"): + asset["game"] = data["game"] + yield asset + + if data["total"] > limit * page: + page += 1 + else: + break + + def config_list(self, key, type_name, valid_values): + value = self.config(key, ["all"]) + if isinstance(value, str): + value = value.split(",") + + if "all" in value: + return ["all"] + + for i in value: + if i not in valid_values: + raise exception.StopExtraction("Invalid %s '%s'", type_name, i) + + return value + + +class SteamgriddbAssetExtractor(SteamgriddbExtractor): + """Extractor for a single asset""" + subcategory = "asset" + pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)" + example = "https://www.steamgriddb.com/grid/1234" + + def __init__(self, match): + SteamgriddbExtractor.__init__(self, match) + self.asset_type = match.group(1) + self.asset_id = match.group(2) + + def assets(self): + endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id + asset = self._call(endpoint)["asset"] + return (asset,) + + +class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor): + subcategory = "grids" + asset_type = "grid" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?" + example = "https://www.steamgriddb.com/game/1234/grids" + valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930", + "512x512", "1024x1024") + valid_styles = ("alternate", "blurred", "no_logo", "material", "white_logo") + valid_file_types = ("png", "jpeg", "jpg", "webp") + + +class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor): + subcategory = "heroes" + asset_type = "hero" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?" + example = "https://www.steamgriddb.com/game/1234/heroes" + valid_dimensions = ("1920x620", "3840x1240", "1600x650") + valid_styles = ("alternate", "blurred", "material") + valid_file_types = ("png", "jpeg", "jpg", "webp") + + +class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor): + subcategory = "logos" + asset_type = "logo" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?" + example = "https://www.steamgriddb.com/game/1234/logos" + valid_dimensions = None + valid_styles = ("official", "white", "black", "custom") + valid_file_types = ("png", "webp") + + +class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor): + subcategory = "icons" + asset_type = "icon" + pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?" 
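+    # valid_dimensions below expands to the square "NxN" sizes accepted
+    # for icons, from 8x8 up to 1024x1024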
+ example = "https://www.steamgriddb.com/game/1234/icons" + valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24, + 28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90, + 96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192, + 194, 256, 310, 512, 768, 1024)] + valid_styles = ("official", "custom") + valid_file_types = ("png", "ico") diff --git a/test/results/steamgriddb.py b/test/results/steamgriddb.py new file mode 100644 index 00000000..06c1c22b --- /dev/null +++ b/test/results/steamgriddb.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import steamgriddb + + +__tests__ = ( +{ + "#url" : "https://www.steamgriddb.com/grid/368023", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, + "#urls" : ("https://cdn2.steamgriddb.com/grid/" + "82fee171d62c044898d99ba0fddeb203.png"), + "#count" : 1, + "#sha1_content": "0bffaccae6f35f9fab529684a5b158d1cec4186b", + + "game": { + "id" : 5259324, + "name": "Helltaker", + }, +}, + +{ + "#url" : "https://www.steamgriddb.com/grid/132605", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, + "#count" : 2, + "#sha1_url" : "4ff9158c008a1f01921d7553bcabf5e6204cdc79", + "#sha1_content": "bc16c5eebf71463abdb33cfbf4b45a2fe092a2b2", + + "game": { + "id" : 5247997, + "name": "OMORI", + }, +}, + +{ + "#url" : "https://www.steamgriddb.com/grid/132605", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, + "#options" : {"download-fake-png": False}, + "#count" : 1, + "#sha1_url" : "f6819c593ff65f15864796fb89581f05d21adddb", + "#sha1_content": "0d9e6114dd8bb9699182fbb7c6bd9064d8b0b6cd", + + "game": { + "id" : 5247997, + "name": "OMORI", + }, +}, + +{ + "#url" : "https://www.steamgriddb.com/hero/61104", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/logo/9610", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/icon/173", + "#category": ("", "steamgriddb", "asset"), + "#class" : steamgriddb.SteamgriddbAssetExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5259324/grids", + "#category": ("", "steamgriddb", "grids"), + "#class" : steamgriddb.SteamgriddbGridsExtractor, + "#range" : "1-10", + "#count" : 10, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5259324/grids", + "#category": ("", "steamgriddb", "grids"), + "#class" : steamgriddb.SteamgriddbGridsExtractor, + "#options" : {"humor": False, "epilepsy": False, "untagged": False}, + "#range" : "1-33", + "#count" : 33, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5331605/heroes", + "#category": ("", "steamgriddb", "heroes"), + "#class" : steamgriddb.SteamgriddbHeroesExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5255394/logos", + "#category": ("", "steamgriddb", "logos"), + "#class" : steamgriddb.SteamgriddbLogosExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/game/5279790/icons", + "#category": ("", "steamgriddb", "icons"), + "#class" : steamgriddb.SteamgriddbIconsExtractor, +}, + +{ + "#url" : "https://www.steamgriddb.com/collection/332/grids", + "#category": ("", "steamgriddb", "grids"), + "#class" : 
steamgriddb.SteamgriddbGridsExtractor, + "#range" : "1-10", + "#count" : 10, +}, + +{ + "#url" : "https://www.steamgriddb.com/collection/332/heroes", + "#category": ("", "steamgriddb", "heroes"), + "#class" : steamgriddb.SteamgriddbHeroesExtractor, + "#options" : {"animated": False}, + "#count" : 0, +}, + +) From 100966b122cd90ca139593cf8ff21fd0f777243a Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 17:18:31 +1100 Subject: [PATCH 48/77] [steamgriddb] fix linting error --- gallery_dl/extractor/steamgriddb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index 516c422b..1f803ffd 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -173,7 +173,8 @@ class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor): example = "https://www.steamgriddb.com/game/1234/grids" valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930", "512x512", "1024x1024") - valid_styles = ("alternate", "blurred", "no_logo", "material", "white_logo") + valid_styles = ("alternate", "blurred", "no_logo", "material", + "white_logo") valid_file_types = ("png", "jpeg", "jpg", "webp") From 0c88373a219a646ab100a6ad89a42cb041cf1fad Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Tue, 9 Jan 2024 17:22:25 +1100 Subject: [PATCH 49/77] [docs] add steamgriddb to supportedsites.md --- docs/supportedsites.md | 6 ++++++ scripts/supportedsites.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 057515c9..92a4cee0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -811,6 +811,12 @@ Consider all listed sites to potentially be NSFW. <td>Presentations</td> <td></td> </tr> +<tr> + <td>SteamGridDB</td> + <td>https://www.steamgriddb.com</td> + <td>Individual Assets, Grids, Heroes, Icons, Logos</td> + <td></td> +</tr> <tr> <td>SubscribeStar</td> <td>https://www.subscribestar.com/</td> diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 798a6830..d4ce3eed 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -121,6 +121,7 @@ CATEGORY_MAP = { "slideshare" : "SlideShare", "smugmug" : "SmugMug", "speakerdeck" : "Speaker Deck", + "steamgriddb" : "SteamGridDB", "subscribestar" : "SubscribeStar", "tbib" : "The Big ImageBoard", "tcbscans" : "TCB Scans", @@ -262,6 +263,9 @@ SUBCATEGORY_MAP = { "smugmug": { "path": "Images from Users and Folders", }, + "steamgriddb": { + "asset": "Individual Assets", + }, "tumblr": { "day": "Days", }, From 0a382a5092d275658a6a32e454ec3ff800b8d853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 17:25:04 +0100 Subject: [PATCH 50/77] [batoto] improve 'manga_id' extraction (#5042) --- gallery_dl/extractor/batoto.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index 9cc6494a..72b5b6e5 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -38,7 +38,8 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) manga, info, _ = extr("<title>", "<").rsplit(" - ", 3) - manga_id = extr("/title/", "/") + manga_id = text.extr( + extr('rel="canonical" href="', '"'), "/title/", "/") match = re.match( r"(?:Volume\s+(\d+) )?" 
From 887ade30a51edeea150fd1a95b33c86208319289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 18:02:49 +0100 Subject: [PATCH 51/77] [batoto] support more mirror domains (#5042) --- gallery_dl/extractor/batoto.py | 7 +- test/results/batoto.py | 140 +++++++++++++++++++++++++++++---- 2 files changed, 128 insertions(+), 19 deletions(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index 72b5b6e5..e82cd09f 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -10,8 +10,11 @@ from .common import Extractor, ChapterExtractor, MangaExtractor from .. import text, exception import re -BASE_PATTERN = (r"(?:https?://)?" - r"(?:(?:ba|d|w)to\.to|(?:batotoo|mangatoto)\.com)") +BASE_PATTERN = (r"(?:https?://)?(?:" + r"(?:ba|d|h|m|w)to\.to|" + r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|" + r"comiko\.(?:net|org)|" + r"bat(?:otoo|o?two)\.com)") class BatotoBase(): diff --git a/test/results/batoto.py b/test/results/batoto.py index d61f7c87..4992bda1 100644 --- a/test/results/batoto.py +++ b/test/results/batoto.py @@ -42,6 +42,19 @@ __tests__ = ( "chapter": 5, }, +{ + "#url" : "https://bato.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://bato.to/chapter/1681030", + "#comment" : "v2 URL", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + { "#url" : "https://bato.to/title/113742-futsutsuka-na-akujo-de-wa-gozaimasu-ga-suuguu-chouso-torikae-den-official", "#category": ("", "batoto", "manga"), @@ -84,15 +97,16 @@ __tests__ = ( }, { - "#url" : "https://bato.to/title/86408/1681030", - "#category": ("", "batoto", "chapter"), - "#class" : batoto.BatotoChapterExtractor, + "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, }, { - "#url" : "https://bato.to/chapter/1681030", - "#category": ("", "batoto", "chapter"), - "#class" : batoto.BatotoChapterExtractor, + "#url" : "https://bato.to/series/86408/i-shall-master-this-family-official", + "#comment" : "v2 URL", + "#category": ("", "batoto", "manga"), + "#class" : batoto.BatotoMangaExtractor, }, { @@ -100,15 +114,18 @@ __tests__ = ( "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, - { - "#url" : "https://wto.to/title/86408/1681030", + "#url" : "https://hto.to/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, - { - "#url" : "https://batotoo.com/title/86408/1681030", + "#url" : "https://mto.to/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://wto.to/title/86408/1681030", "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, @@ -118,17 +135,106 @@ __tests__ = ( "#category": ("", "batoto", "chapter"), "#class" : batoto.BatotoChapterExtractor, }, - { - "#url" : "https://bato.to/title/86408-i-shall-master-this-family-official", - "#category": ("", "batoto", "manga"), - "#class" : batoto.BatotoMangaExtractor, + "#url" : "https://mangatoto.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://mangatoto.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, }, { - 
"#url" : "https://bato.to/series/86408/i-shall-master-this-family-official", - "#category": ("", "batoto", "manga"), - "#class" : batoto.BatotoMangaExtractor, + "#url" : "https://batocomic.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://batocomic.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://batocomic.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://readtoto.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://readtoto.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://readtoto.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://xbato.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://xbato.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://xbato.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://zbato.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://zbato.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://zbato.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://comiko.net/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://comiko.org/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, + +{ + "#url" : "https://batotoo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://batotwo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, +}, +{ + "#url" : "https://battwo.com/title/86408/1681030", + "#category": ("", "batoto", "chapter"), + "#class" : batoto.BatotoChapterExtractor, }, ) From 5f9a98cf0fded6dac8efcc02b4f2cbc39ebc614a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 20:04:46 +0100 Subject: [PATCH 52/77] [deviantart:avatar] fix exception when 'comments' are enabled (#4995) --- gallery_dl/extractor/deviantart.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 4b5f1d77..32dedacf 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -558,6 +558,7 @@ class DeviantartAvatarExtractor(DeviantartExtractor): "is_downloadable": False, "published_time" : 0, "title" : "avatar", + "stats" : {"comments": 0}, "content" : { "src": url.replace("/avatars/", "/avatars-big/", 1), }, From 5c43098a1ae062456e040246459f063bc84eefff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= 
<mike_faehrmann@web.de> Date: Tue, 9 Jan 2024 23:19:39 +0100 Subject: [PATCH 53/77] [twitter] revert to using 'media' timeline by default (#4953) This reverts commit a94f9441487573ea84700936117f4535e78d32c0. --- docs/configuration.rst | 2 +- gallery_dl/extractor/twitter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 8a1752ee..ba3cc413 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3496,7 +3496,7 @@ Description * ``"tweets"``: `/tweets <https://twitter.com/USER/tweets>`__ timeline + search * ``"media"``: `/media <https://twitter.com/USER/media>`__ timeline + search * ``"with_replies"``: `/with_replies <https://twitter.com/USER/with_replies>`__ timeline + search - * ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets <extractor.twitter.retweets_>`__, `replies <extractor.twitter.replies_>`__, and `text-tweets <extractor.twitter.text-tweets_>`__ settings + * ``"auto"``: ``"tweets"`` or ``"media"``, depending on `retweets <extractor.twitter.retweets_>`__ and `text-tweets <extractor.twitter.text-tweets_>`__ settings extractor.twitter.text-tweets diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index aa9ab9f6..cf759e0f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -546,7 +546,7 @@ class TwitterTimelineExtractor(TwitterExtractor): def _select_tweet_source(self): strategy = self.config("strategy") if strategy is None or strategy == "auto": - if self.retweets or self.replies or self.textonly: + if self.retweets or self.textonly: return self.api.user_tweets else: return self.api.user_media From 39904c9e4eb2fe664ff5855bc2d3c2d749dcb690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 10 Jan 2024 17:13:34 +0100 Subject: [PATCH 54/77] [deviantart:avatar] add 'formats' option (#4995) --- docs/configuration.rst | 13 +++++++ gallery_dl/extractor/deviantart.py | 56 +++++++++++++++++++++--------- test/results/deviantart.py | 17 ++++++++- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index ba3cc413..00be43a7 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1559,6 +1559,19 @@ Description Minimum wait time in seconds before API requests. +extractor.deviantart.avatar.formats +----------------------------------- +Type + ``list`` of ``strings`` +Example + ``["original.jpg", "big.jpg", "big.gif", ".png"]`` +Description + Avatar URL formats to return. + + | Each format is parsed as ``SIZE.EXT``. + | Leave ``SIZE`` empty to download the regular, small avatar format. 
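+
+    For example, ``"big.jpg"`` is parsed as size ``big`` and extension
+    ``jpg`` and resolves to a URL such as
+    ``https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4``
+    (user ``shimoda7``), while ``".png"`` keeps the regular, unsized
+    ``avatars`` directory.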
+ + extractor.[E621].metadata ------------------------- Type diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 32dedacf..7df1890e 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -547,23 +547,45 @@ class DeviantartAvatarExtractor(DeviantartExtractor): example = "https://www.deviantart.com/USER/avatar/" def deviations(self): - profile = self.api.user_profile(self.user.lower()) - if profile: - url = profile["user"]["usericon"] - return ({ - "author" : profile["user"], - "category" : "avatar", - "index" : text.parse_int(url.rpartition("?")[2]), - "is_deleted" : False, - "is_downloadable": False, - "published_time" : 0, - "title" : "avatar", - "stats" : {"comments": 0}, - "content" : { - "src": url.replace("/avatars/", "/avatars-big/", 1), - }, - },) - return () + name = self.user.lower() + profile = self.api.user_profile(name) + if not profile: + return () + + user = profile["user"] + icon = user["usericon"] + index = icon.rpartition("?")[2] + + formats = self.config("formats") + if not formats: + url = icon.replace("/avatars/", "/avatars-big/", 1) + return (self._make_deviation(url, user, index, ""),) + + if isinstance(formats, str): + formats = formats.replace(" ", "").split(",") + + results = [] + for fmt in formats: + fmt, _, ext = fmt.rpartition(".") + if fmt: + fmt = "-" + fmt + url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format( + fmt, name[0], name[1], name, ext, index) + results.append(self._make_deviation(url, user, index, fmt)) + return results + + def _make_deviation(self, url, user, index, fmt): + return { + "author" : user, + "category" : "avatar", + "index" : text.parse_int(index), + "is_deleted" : False, + "is_downloadable": False, + "published_time" : 0, + "title" : "avatar" + fmt, + "stats" : {"comments": 0}, + "content" : {"src": url}, + } class DeviantartBackgroundExtractor(DeviantartExtractor): diff --git a/test/results/deviantart.py b/test/results/deviantart.py index 45ee6c18..41cb3219 100644 --- a/test/results/deviantart.py +++ b/test/results/deviantart.py @@ -210,7 +210,7 @@ __tests__ = ( "#sha1_content": "abf2cc79b842315f2e54bfdd93bf794a0f612b6f", "author" : { - "type" : "premium", + "type" : "regular", "usericon": "https://a.deviantart.net/avatars/s/h/shimoda7.jpg?4", "userid" : "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B", "username": "shimoda7", @@ -237,6 +237,21 @@ __tests__ = ( "username" : "shimoda7", }, +{ + "#url" : "https://deviantart.com/shimoda7/avatar", + "#comment" : "'formats' option", + "#category": ("", "deviantart", "avatar"), + "#class" : deviantart.DeviantartAvatarExtractor, + "#archive" : False, + "#options" : {"formats": ["original.jpg", "big.jpg", "big.png", "big.gif"]}, + "#urls" : ( + "https://a.deviantart.net/avatars-original/s/h/shimoda7.jpg?4", + "https://a.deviantart.net/avatars-big/s/h/shimoda7.jpg?4", + "https://a.deviantart.net/avatars-big/s/h/shimoda7.png?4", + "https://a.deviantart.net/avatars-big/s/h/shimoda7.gif?4", + ), +}, + { "#url" : "https://deviantart.com/gdldev/banner", "#category": ("", "deviantart", "background"), From bbf96753e2ab8d02adb4682a5a2d607943914627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 10 Jan 2024 17:21:30 +0100 Subject: [PATCH 55/77] [gelbooru] only log "Incomplete API response" for favorites (#5045) --- gallery_dl/extractor/gelbooru.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/gelbooru.py 
b/gallery_dl/extractor/gelbooru.py index eba15390..e37b2e92 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -23,7 +23,7 @@ class GelbooruBase(): root = "https://gelbooru.com" offset = 0 - def _api_request(self, params, key="post"): + def _api_request(self, params, key="post", log=False): if "s" not in params: params["s"] = "post" params["api_key"] = self.api_key @@ -35,8 +35,9 @@ class GelbooruBase(): try: posts = data[key] except KeyError: - self.log.error("Incomplete API response (missing '%s')", key) - self.log.debug("%s", data) + if log: + self.log.error("Incomplete API response (missing '%s')", key) + self.log.debug("%s", data) return [] if not isinstance(posts, list): @@ -169,7 +170,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, "limit": "1", } - count = self._api_request(params, "@attributes")[0]["count"] + count = self._api_request(params, "@attributes", True)[0]["count"] if count <= self.offset: return @@ -186,7 +187,7 @@ class GelbooruFavoriteExtractor(GelbooruBase, params["limit"] = self.per_page while True: - favs = self._api_request(params, "favorite") + favs = self._api_request(params, "favorite", True) favs.reverse() if skip: From 2191e29e14ab138da8347744c993df0b40b85a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 10 Jan 2024 23:27:10 +0100 Subject: [PATCH 56/77] [nijie] fix image URL for single image posts (#5049) --- gallery_dl/extractor/nijie.py | 3 ++- test/results/nijie.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index b9917057..96145130 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -116,7 +116,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): yield from text.extract_iter( page, 'href="javascript:void(0);"><img src="', '"') else: - yield text.extr(page, 'itemprop="image" src="', '"') + pos = page.find('id="view-center"') + 1 + yield text.extract(page, 'itemprop="image" src="', '"', pos)[0] @staticmethod def _extract_user_name(page): diff --git a/test/results/nijie.py b/test/results/nijie.py index a2c05c81..1f86bcb1 100644 --- a/test/results/nijie.py +++ b/test/results/nijie.py @@ -157,6 +157,14 @@ __tests__ = ( "user_name" : "黒川 竜", }, +{ + "#url" : "https://nijie.info/view.php?id=37078", + "#comment" : "'view_side_dojin' thumbnails (#5049)", + "#category": ("Nijie", "nijie", "image"), + "#class" : nijie.NijieImageExtractor, + "#urls" : "https://pic.nijie.net/03/nijie/13/98/498/illust/0_0_703023d18ca8d058_bca943.jpg", +}, + { "#url" : "https://nijie.info/view.php?id=70724", "#category": ("Nijie", "nijie", "image"), From 1c68b7df010913cb661f06224bbbf7b610c79590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Thu, 11 Jan 2024 17:56:47 +0100 Subject: [PATCH 57/77] [patreon] fix KeyError (#5048) --- gallery_dl/extractor/patreon.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 6c2f39dc..c175ab83 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -56,15 +56,16 @@ class PatreonExtractor(Extractor): else: self.log.debug("skipping %s (%s %s)", url, fhash, kind) - @staticmethod - def _postfile(post): + def _postfile(self, post): postfile = post.get("post_file") if postfile: - return (("postfile", postfile["url"], postfile["name"]),) + url = postfile["url"] + 
name = postfile.get("name") or self._filename(url) or url + return (("postfile", url, name),) return () def _images(self, post): - for image in post["images"]: + for image in post.get("images") or (): url = image.get("download_url") if url: name = image.get("file_name") or self._filename(url) or url @@ -80,7 +81,7 @@ class PatreonExtractor(Extractor): return () def _attachments(self, post): - for attachment in post["attachments"]: + for attachment in post.get("attachments") or (): url = self.request( attachment["url"], method="HEAD", allow_redirects=False, fatal=False, From 2dcfb012ea0b773d22d1898c7f28e6bf3fa90eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 02:33:27 +0100 Subject: [PATCH 58/77] [patreon] download 'm3u8' manifests with ytdl --- gallery_dl/extractor/patreon.py | 6 +++++- test/results/patreon.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index c175ab83..dfcfe24b 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -52,7 +52,11 @@ class PatreonExtractor(Extractor): post["hash"] = fhash post["type"] = kind post["num"] += 1 - yield Message.Url, url, text.nameext_from_url(name, post) + text.nameext_from_url(name, post) + if text.ext_from_url(url) == "m3u8": + url = "ytdl:" + url + post["extension"] = "mp4" + yield Message.Url, url, post else: self.log.debug("skipping %s (%s %s)", url, fhash, kind) diff --git a/test/results/patreon.py b/test/results/patreon.py index d4557173..79c0a603 100644 --- a/test/results/patreon.py +++ b/test/results/patreon.py @@ -103,6 +103,14 @@ __tests__ = ( "tags": ["AWMedia"], }, +{ + "#url" : "https://www.patreon.com/posts/meu8-94714289", + "#category": ("", "patreon", "post"), + "#class" : patreon.PatreonPostExtractor, + "#range" : "2", + "#pattern" : r"ytdl:https://stream\.mux\.com/NLrxTLdxyGStpOgapJAtB8uPGAaokEcj8YovML00y2DY\.m3u8\?token=ey.+", +}, + { "#url" : "https://www.patreon.com/posts/not-found-123", "#category": ("", "patreon", "post"), From 58e0665fbcefe050d90e3b629bfb52559f9f7670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 03:21:44 +0100 Subject: [PATCH 59/77] [tests] load config from external file --- test/test_results.py | 72 +++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 48 deletions(-) diff --git a/test/test_results.py b/test/test_results.py index c7a50019..6b60e9d8 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -28,6 +28,16 @@ BROKEN = { "photobucket", } +CONFIG = { + "cache": { + "file": None, + }, + "downloader": { + "adjust-extensions": False, + "part": False, + }, +} + class TestExtractorResults(unittest.TestCase): @@ -348,56 +358,21 @@ class TestFormatter(formatter.StringFormatter): def setup_test_config(): - name = "gallerydl" - email = "gallerydl@openaliasbox.org" - email2 = "gallerydl@protonmail.com" + config._config.update(CONFIG) - config.clear() - config.set(("cache",), "file", None) - config.set(("downloader",), "part", False) - config.set(("downloader",), "adjust-extensions", False) - config.set(("extractor" ,), "timeout" , 60) - config.set(("extractor" ,), "username", name) - config.set(("extractor" ,), "password", name) - config.set(("extractor", "nijie") , "username", email) - config.set(("extractor", "seiga") , "username", email) - config.set(("extractor", "horne") , "username", email2) - 
config.set(("extractor", "pinterest") , "username", email2) - config.set(("extractor", "pinterest") , "username", None) # login broken - - config.set(("extractor", "newgrounds"), "username", "d1618111") - config.set(("extractor", "newgrounds"), "password", "d1618111") - - config.set(("extractor", "mangoxo") , "username", "LiQiang3") - config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") - - for category in ("danbooru", "atfbooru", "aibooru", "booruvar", - "e621", "e926", "e6ai", - "instagram", "twitter", "subscribestar", "deviantart", - "inkbunny", "tapas", "pillowfort", "mangadex", - "vipergirls"): - config.set(("extractor", category), "username", None) - - config.set(("extractor", "mastodon.social"), "access-token", - "Blf9gVqG7GytDTfVMiyYQjwVMQaNACgf3Ds3IxxVDUQ") - - config.set(("extractor", "nana"), "favkey", - "9237ddb82019558ea7d179e805100805" - "ea6aa1c53ca6885cd4c179f9fb22ead2") - - config.set(("extractor", "deviantart"), "client-id", "7777") - config.set(("extractor", "deviantart"), "client-secret", - "ff14994c744d9208e5caeec7aab4a026") - - config.set(("extractor", "tumblr"), "api-key", - "0cXoHfIqVzMQcc3HESZSNsVlulGxEXGDTTZCDrRrjaa0jmuTc6") - config.set(("extractor", "tumblr"), "api-secret", - "6wxAK2HwrXdedn7VIoZWxGqVhZ8JdYKDLjiQjL46MLqGuEtyVj") - config.set(("extractor", "tumblr"), "access-token", - "N613fPV6tOZQnyn0ERTuoEZn0mEqG8m2K8M3ClSJdEHZJuqFdG") - config.set(("extractor", "tumblr"), "access-token-secret", - "sgOA7ZTT4FBXdOGGVV331sSp0jHYp4yMDRslbhaQf7CaS71i4O") +def load_test_config(): + try: + path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), + "archive", "config.json") + with open(path) as fp: + CONFIG.update(json.loads(fp.read())) + except FileNotFoundError: + pass + except Exception as exc: + print("Error when loading {}: {}: {}".format( + path, exc.__class__.__name__, exc)) def generate_tests(): @@ -446,6 +421,7 @@ def generate_tests(): setattr(TestExtractorResults, method.__name__, method) +load_test_config() generate_tests() if __name__ == "__main__": unittest.main(warnings="ignore") From b97af09e03ada519aeed0a6f723fd2e733732811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 03:23:21 +0100 Subject: [PATCH 60/77] [tests] include URL in failure report --- test/test_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_results.py b/test/test_results.py index 6b60e9d8..575fc0f3 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -417,6 +417,7 @@ def generate_tests(): enum[name] += 1 method = _generate_method(result) + method.__doc__ = result["#url"] method.__name__ = "test_{}_{}".format(name, enum[name]) setattr(TestExtractorResults, method.__name__, method) From b1c175fdd1a5f5258d0f6aace5d3639446847e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Fri, 12 Jan 2024 16:38:18 +0100 Subject: [PATCH 61/77] allow using an empty string as argument for -D/--directory --- gallery_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index fff53eb5..19ea77b2 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -45,7 +45,7 @@ def main(): elif filename.startswith("\\f"): filename = "\f" + filename[2:] config.set((), "filename", filename) - if args.directory: + if args.directory is not None: config.set((), "base-directory", args.directory) config.set((), "directory", ()) if args.postprocessors: From 
8995fd5f0114695732bb994a61abe317f72d9bde Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 09:55:39 +1100 Subject: [PATCH 62/77] [steamgriddb] implement suggestions --- docs/configuration.rst | 2 +- gallery_dl/extractor/steamgriddb.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index cfd67b3d..e54b2e6e 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3131,7 +3131,7 @@ Type Default ``"all"`` Examples - * ``"png,jpg"`` + * ``"png,jpeg"`` * ``["jpeg", "webp"]`` Description Only include assets that are in the specified file types. ``all`` can be diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index 1f803ffd..eb00a9f4 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -56,16 +56,16 @@ class SteamgriddbExtractor(Extractor): download_fake_png = self.config("download-fake-png", True) for asset in self.assets(): - urls = [asset["url"]] + urls = (asset["url"],) if download_fake_png and asset.get("fake_png"): - urls.append(asset["fake_png"]) + urls = (asset["url"], asset["fake_png"]) asset["count"] = len(urls) yield Message.Directory, asset for asset["num"], url in enumerate(urls, 1): yield Message.Url, url, text.nameext_from_url(url, asset) - def _call(self, endpoint: str, **kwargs): + def _call(self, endpoint, **kwargs): data = self.request(self.root + endpoint, **kwargs).json() if not data["success"]: raise exception.StopExtraction(data["error"]) @@ -129,17 +129,16 @@ class SteamgriddbAssetsExtractor(SteamgriddbExtractor): asset["game"] = data["game"] yield asset - if data["total"] > limit * page: - page += 1 - else: + if data["total"] <= limit * page: break + page += 1 def config_list(self, key, type_name, valid_values): - value = self.config(key, ["all"]) + value = self.config(key) if isinstance(value, str): value = value.split(",") - if "all" in value: + if value is None or "all" in value: return ["all"] for i in value: From 65f42442f562e91f74e3f4881f059007552be41e Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 10:12:15 +1100 Subject: [PATCH 63/77] [steamgriddb] implement another suggestion --- gallery_dl/extractor/steamgriddb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index eb00a9f4..9d46fd6b 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -56,9 +56,10 @@ class SteamgriddbExtractor(Extractor): download_fake_png = self.config("download-fake-png", True) for asset in self.assets(): - urls = (asset["url"],) if download_fake_png and asset.get("fake_png"): urls = (asset["url"], asset["fake_png"]) + else: + urls = (asset["url"],) asset["count"] = len(urls) yield Message.Directory, asset From 293f1559dfb24ccdb823f4bd023f6a9d1b88fb6f Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 10:42:22 +1100 Subject: [PATCH 64/77] [hatenablog] implement suggestions --- gallery_dl/extractor/hatenablog.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index dd1e45a5..40c36bb6 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -13,7 +13,7 @@ from .. 
import text BASE_PATTERN = ( r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" - r"([\w-]+\.(?:hatenablog\.com|hatenablog\.jp" + r"([\w-]+\.(?:hatenablog\.(?:com|jp)" r"|hatenadiary\.com|hateblo\.jp)))" ) QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" @@ -28,29 +28,26 @@ class HatenablogExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.domain = match.group(1) or match.group(2) - self._find_img = re.compile(r'<img +(.+?) */?>').finditer - self._is_image = re.compile( - r'(?: |^)class="hatena-fotolife"(?: |$)').search - self._find_img_src = re.compile(r'(?: |^)src="(.+?)"(?: |$)').search + + def _init(self): + self._find_img = re.compile(r'<img +([^>]+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) date = text.parse_datetime(extr('<time datetime="', '"')) - entry_link = text.unescape(extr( - '<a href="', '" class="entry-title-link bookmark">')) + entry_link = text.unescape(extr('<a href="', '"')) entry = entry_link.partition("/entry/")[2] - title = extr('', '</a>') + title = text.unescape(extr('>', '<')) content = extr( '<div class="entry-content hatenablog-entry">', '</div>') images = [] for i in self._find_img(content): attributes = i.group(1) - if not self._is_image(attributes): + if 'class="hatena-fotolife"' not in attributes: continue - image = text.unescape(self._find_img_src(attributes).group(1)) + image = text.unescape(text.extr(attributes, 'src="', '"')) images.append(image) data = { @@ -74,8 +71,11 @@ class HatenablogEntriesExtractor(HatenablogExtractor): self.path = match.group(3) self.query = {key: value for key, value in text.parse_query( match.group(4)).items() if self._acceptable_query(key)} + + def _init(self): + HatenablogExtractor._init(self) self._find_pager_url = re.compile( - r'<span class="pager-next">\s*<a href="(.+?)"').search + r' class="pager-next">\s*<a href="([^"]+)').search def items(self): url = "https://" + self.domain + self.path @@ -117,7 +117,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): article = extr('', '</article>') yield from self._handle_article(article) - def _acceptable_query(self, key: str) -> bool: + def _acceptable_query(self, key): return key == "page" or key in self.allowed_parameters @@ -154,8 +154,8 @@ class HatenablogHomeExtractor(HatenablogEntriesExtractor): class HatenablogArchiveExtractor(HatenablogEntriesExtractor): """Extractor for a blog's archive page""" subcategory = "archive" - pattern = BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + \ - r"|/category/[^?#]+)?)" + QUERY_RE + pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?" + r"|/category/[^?#]+)?)" + QUERY_RE) example = "https://BLOG.hatenablog.com/archive/2024" From 9f53daabb8e031871a604707bcc46f5359818910 Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sat, 13 Jan 2024 10:43:25 +1100 Subject: [PATCH 65/77] [hatenablog] implement additional suggestion --- gallery_dl/extractor/hatenablog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 40c36bb6..792f6664 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -12,7 +12,7 @@ from .. import text BASE_PATTERN = ( - r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" + r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?" 
r"([\w-]+\.(?:hatenablog\.(?:com|jp)" r"|hatenadiary\.com|hateblo\.jp)))" ) From bb446b15983ff0c09245fe58e9e9b997b73c4d77 Mon Sep 17 00:00:00 2001 From: blankie <blankie@nixnetmail.com> Date: Sun, 14 Jan 2024 19:26:49 +1100 Subject: [PATCH 66/77] [webtoons] extract more metadata --- gallery_dl/extractor/webtoons.py | 37 +++++++++++++++++++------------- test/results/webtoons.py | 12 +++++++++++ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 3f2f410d..1c7af470 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -87,23 +87,30 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): self.episode_no = params.get("episode_no") def metadata(self, page): - keywords, pos = text.extract( - page, '<meta name="keywords" content="', '"') - title, pos = text.extract( - page, '<meta property="og:title" content="', '"', pos) - descr, pos = text.extract( - page, '<meta property="og:description" content="', '"', pos) + extr = text.extract_from(page) + keywords = extr('<meta name="keywords" content="', '"').split(", ") + title = extr('<meta property="og:title" content="', '"') + descr = extr('<meta property="og:description" content="', '"') + + author_area = extr('<div class="author_area">', '</div>') + aa_extr = text.extract_from(author_area) + username = aa_extr('/creator/', '"') + author_name = aa_extr('<span>', '</span>') return { - "genre" : self.genre, - "comic" : self.comic, - "title_no" : self.title_no, - "episode_no" : self.episode_no, - "title" : text.unescape(title), - "episode" : keywords.split(", ")[1], - "description": text.unescape(descr), - "lang" : self.lang, - "language" : util.code_to_language(self.lang), + "genre" : self.genre, + "comic" : self.comic, + "title_no" : self.title_no, + "episode_no" : self.episode_no, + "title" : text.unescape(title), + "episode" : keywords[1], + "comic_name" : text.unescape(keywords[0]), + "episode_name": text.unescape(keywords[2]), + "username" : username, + "author_name" : text.unescape(author_name), + "description" : text.unescape(descr), + "lang" : self.lang, + "language" : util.code_to_language(self.lang), } @staticmethod diff --git a/test/results/webtoons.py b/test/results/webtoons.py index d2a177fd..9ca93446 100644 --- a/test/results/webtoons.py +++ b/test/results/webtoons.py @@ -37,6 +37,18 @@ __tests__ = ( "title_no" : "312584", }, +{ + "#url" : "https://www.webtoons.com/en/canvas/i-want-to-be-a-cute-anime-girl/209-the-storys-story/viewer?title_no=349416&episode_no=214", + "#category": ("", "webtoons", "episode"), + "#class" : webtoons.WebtoonsEpisodeExtractor, + "#count" : 4, + + "comic_name" : "I want to be a cute anime girl", + "episode_name": "209 - The story's story", + "username" : "m9huj", + "author_name" : "Azul Crescent", +}, + { "#url" : "https://www.webtoons.com/en/comedy/live-with-yourself/list?title_no=919", "#comment" : "english", From 69726fc82c96d54af72746b379948ffef103070a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Sun, 14 Jan 2024 22:09:26 +0100 Subject: [PATCH 67/77] [tests] skip tests requiring auth when non is provided --- test/results/coomerparty.py | 13 ++++++++++--- test/results/kemonoparty.py | 3 +++ test/test_results.py | 25 +++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/test/results/coomerparty.py b/test/results/coomerparty.py index dfc4a188..87c932e8 100644 --- a/test/results/coomerparty.py +++ 
b/test/results/coomerparty.py @@ -9,11 +9,18 @@ from gallery_dl.extractor import kemonoparty __tests__ = ( { - "#url" : "https://coomer.party/onlyfans/user/alinity/post/125962203", - "#comment" : "coomer.party (#2100)", + "#url" : "https://coomer.su/onlyfans/user/alinity/post/125962203", + "#comment" : "coomer (#2100)", "#category": ("", "coomerparty", "onlyfans"), "#class" : kemonoparty.KemonopartyPostExtractor, - "#pattern" : r"https://coomer\.party/data/7d/3f/7d3fd9804583dc224968c0591163ec91794552b04f00a6c2f42a15b68231d5a8\.jpg", + "#urls" : "https://coomer.su/data/7d/3f/7d3fd9804583dc224968c0591163ec91794552b04f00a6c2f42a15b68231d5a8.jpg", +}, + +{ + "#url" : "https://coomer.party/onlyfans/user/alinity/post/125962203", + "#category": ("", "coomerparty", "onlyfans"), + "#class" : kemonoparty.KemonopartyPostExtractor, + "#urls" : "https://coomer.party/data/7d/3f/7d3fd9804583dc224968c0591163ec91794552b04f00a6c2f42a15b68231d5a8.jpg", }, ) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index ad94a496..5bd541a3 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -297,6 +297,7 @@ __tests__ = ( "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyUserExtractor.pattern, + "#auth" : True, "#count" : 3, "#sha1_url": "f4b5b796979bcba824af84206578c79101c7f0e1", }, @@ -306,6 +307,7 @@ __tests__ = ( "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, + "#auth" : True, "#count" : 3, "#sha1_url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", }, @@ -315,6 +317,7 @@ __tests__ = ( "#category": ("", "kemonoparty", "favorite"), "#class" : kemonoparty.KemonopartyFavoriteExtractor, "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, + "#auth" : True, "#count" : 3, "#sha1_url": "4be8e84cb384a907a8e7997baaf6287b451783b5", }, diff --git a/test/test_results.py b/test/test_results.py index 575fc0f3..12fe59d5 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -38,6 +38,15 @@ CONFIG = { }, } +AUTH = { + "pixiv", + "nijie", + "horne", + "seiga", + "instagram", + "twitter", +} + class TestExtractorResults(unittest.TestCase): @@ -76,6 +85,18 @@ class TestExtractorResults(unittest.TestCase): for key, value in result["#options"].items(): key = key.split(".") config.set(key[:-1], key[-1], value) + + requires_auth = result.get("#auth") + if requires_auth is None: + requires_auth = (result["#category"][1] in AUTH) + if requires_auth: + extr = result["#class"].from_url(result["#url"]) + if not any(extr.config(key) for key in ( + "username", "cookies", "api-key", "client-id")): + msg = "no auth" + self._skipped.append((result["#url"], msg)) + self.skipTest(msg) + if "#range" in result: config.set((), "image-range" , result["#range"]) config.set((), "chapter-range", result["#range"]) @@ -371,7 +392,7 @@ def load_test_config(): except FileNotFoundError: pass except Exception as exc: - print("Error when loading {}: {}: {}".format( + sys.exit("Error when loading {}: {}: {}".format( path, exc.__class__.__name__, exc)) @@ -422,7 +443,7 @@ def generate_tests(): setattr(TestExtractorResults, method.__name__, method) -load_test_config() generate_tests() if __name__ == "__main__": + load_test_config() unittest.main(warnings="ignore") From 6c4abc982e79b3f7b65bebbeddee01e32ec3f36d Mon Sep 17 00:00:00 2001 From: hunter-gatherer8 <hunter.gatherer8@proton.me> Date: Fri, 18 Aug 2023 
00:23:22 +0300 Subject: [PATCH 68/77] [2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/2ch.py | 84 ++++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + 3 files changed, 91 insertions(+) create mode 100644 gallery_dl/extractor/2ch.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a704cf4..53c88335 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW. </tr> </thead> <tbody valign="top"> +<tr> + <td>2ch</td> + <td>https://2ch.hk/</td> + <td>Boards, Threads</td> + <td></td> +</tr> <tr> <td>2chen</td> <td>https://sturdychan.help/</td> diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py new file mode 100644 index 00000000..f841dd3c --- /dev/null +++ b/gallery_dl/extractor/2ch.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.2ch.hk/""" + +from .common import Extractor, Message +from .. import text + + +class _2chThreadExtractor(Extractor): + """Extractor for 2ch threads""" + category = "2ch" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{file_id} - {filename}.{extension}" + archive_fmt = "{board}_{thread}_{file_id}" + pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = f"https://2ch.hk/{self.board}/res/{self.thread}.json" + thread_data = self.request(url).json() + + posts = thread_data["threads"][0]["posts"] + post = posts[0] + title = post.get("subject") or text.remove_html(post["comment"]) + + thread_metadata = { + "board": self.board, + "thread": self.thread, + "title": text.unescape(title)[:50], + } + + yield Message.Directory, thread_metadata + for post in posts: + if "files" in post and post['files']: + for file in post['files']: + file_metadata = { + "post_num": post["num"], + "file_id": file["name"].split('.')[0], + "filename": ".".join(file["fullname"].split('.')[:-1]), + "extension": file["name"].split('.')[-1], + } + file_metadata.update(thread_metadata) + + url = f"https://2ch.hk/{file['path']}" + yield Message.Url, url, file_metadata + + +class _2chBoardExtractor(Extractor): + """Extractor for 2ch boards""" + category = "2ch" + subcategory = "board" + pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def get_pages(self): + url = f"https://2ch.hk/{self.board}/index.json" + index_page = self.request(url).json() + pages_total = len(index_page['pages']) + + yield index_page + for i in range(1, pages_total): + url = f"https://2ch.hk/{self.board}/{i}.json" + yield self.request(url).json() + + def get_thread_nums(self): + for page in self.get_pages(): + for thread in page["threads"]: + yield thread["thread_num"] + + def items(self): + for thread_num in self.get_thread_nums(): + url = f"https://2ch.hk/{self.board}/res/{thread_num}.html" + yield Message.Queue, url, {"_extractor": _2chThreadExtractor} diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 
13d7b38b..8e712961 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,6 +10,7 @@ import sys import re modules = [ + "2ch", "2chan", "2chen", "35photo", From 68196589c42bf3fadea2437cf996293da1892176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 8 Jan 2024 02:04:34 +0100 Subject: [PATCH 69/77] [2ch] update - simplify extractor code - more metadata - add tests --- gallery_dl/extractor/2ch.py | 95 ++++++++++++++++++++----------------- test/results/2ch.py | 64 +++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 44 deletions(-) create mode 100644 test/results/2ch.py diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py index f841dd3c..dbbf21b6 100644 --- a/gallery_dl/extractor/2ch.py +++ b/gallery_dl/extractor/2ch.py @@ -4,81 +4,88 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://www.2ch.hk/""" +"""Extractors for https://2ch.hk/""" from .common import Extractor, Message -from .. import text +from .. import text, util class _2chThreadExtractor(Extractor): """Extractor for 2ch threads""" category = "2ch" subcategory = "thread" + root = "https://2ch.hk" directory_fmt = ("{category}", "{board}", "{thread} {title}") - filename_fmt = "{file_id} - {filename}.{extension}" - archive_fmt = "{board}_{thread}_{file_id}" - pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html" + filename_fmt = "{tim}{filename:? //}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)" + example = "https://2ch.hk/a/res/12345.html" def __init__(self, match): Extractor.__init__(self, match) self.board, self.thread = match.groups() def items(self): - url = f"https://2ch.hk/{self.board}/res/{self.thread}.json" - thread_data = self.request(url).json() + url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread) + posts = self.request(url).json()["threads"][0]["posts"] - posts = thread_data["threads"][0]["posts"] - post = posts[0] - title = post.get("subject") or text.remove_html(post["comment"]) + op = posts[0] + title = op.get("subject") or text.remove_html(op["comment"]) - thread_metadata = { - "board": self.board, + thread = { + "board" : self.board, "thread": self.thread, - "title": text.unescape(title)[:50], + "title" : text.unescape(title)[:50], } - yield Message.Directory, thread_metadata + yield Message.Directory, thread for post in posts: - if "files" in post and post['files']: - for file in post['files']: - file_metadata = { - "post_num": post["num"], - "file_id": file["name"].split('.')[0], - "filename": ".".join(file["fullname"].split('.')[:-1]), - "extension": file["name"].split('.')[-1], - } - file_metadata.update(thread_metadata) + files = post.get("files") + if files: + post["post_name"] = post["name"] + post["date"] = text.parse_timestamp(post["timestamp"]) + del post["files"] + del post["name"] - url = f"https://2ch.hk/{file['path']}" - yield Message.Url, url, file_metadata + for file in files: + file.update(thread) + file.update(post) + + file["filename"] = file["fullname"].rpartition(".")[0] + file["tim"], _, file["extension"] = \ + file["name"].rpartition(".") + + yield Message.Url, self.root + file["path"], file class _2chBoardExtractor(Extractor): """Extractor for 2ch boards""" category = "2ch" subcategory = "board" - pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$" + root = "https://2ch.hk" + pattern = 
r"(?:https?://)?2ch\.hk/([^/?#]+)/?$" + example = "https://2ch.hk/a/" def __init__(self, match): Extractor.__init__(self, match) self.board = match.group(1) - def get_pages(self): - url = f"https://2ch.hk/{self.board}/index.json" - index_page = self.request(url).json() - pages_total = len(index_page['pages']) - - yield index_page - for i in range(1, pages_total): - url = f"https://2ch.hk/{self.board}/{i}.json" - yield self.request(url).json() - - def get_thread_nums(self): - for page in self.get_pages(): - for thread in page["threads"]: - yield thread["thread_num"] - def items(self): - for thread_num in self.get_thread_nums(): - url = f"https://2ch.hk/{self.board}/res/{thread_num}.html" - yield Message.Queue, url, {"_extractor": _2chThreadExtractor} + # index page + url = "{}/{}/index.json".format(self.root, self.board) + index = self.request(url).json() + index["_extractor"] = _2chThreadExtractor + for thread in index["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, index + + # pages 1..n + for n in util.advance(index["pages"], 1): + url = "{}/{}/{}.json".format(self.root, self.board, n) + page = self.request(url).json() + page["_extractor"] = _2chThreadExtractor + for thread in page["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, page diff --git a/test/results/2ch.py b/test/results/2ch.py new file mode 100644 index 00000000..5400292c --- /dev/null +++ b/test/results/2ch.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +gallery_dl = __import__("gallery_dl.extractor.2ch") +_2ch = getattr(gallery_dl.extractor, "2ch") + + +__tests__ = ( +{ + "#url" : "https://2ch.hk/a/res/6202876.html", + "#category": ("", "2ch", "thread"), + "#class" : _2ch._2chThreadExtractor, + "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+", + "#count" : range(450, 1000), + + "banned" : 0, + "board" : "a", + "closed" : 0, + "comment" : str, + "date" : "type:datetime", + "displayname": str, + "email" : "", + "endless" : 1, + "extension": str, + "filename" : str, + "fullname" : str, + "height" : int, + "lasthit" : 1705273977, + "md5" : r"re:[0-9a-f]{32}", + "name" : r"re:\d+\.\w+", + "num" : int, + "number" : range(1, 1000), + "op" : 0, + "parent" : int, + "path" : r"re:/a/src/6202876/\d+\.\w+", + "post_name": "Аноним", + "size" : int, + "sticky" : 0, + "subject" : str, + "thread" : "6202876", + "thumbnail": str, + "tim" : r"re:\d+", + "timestamp": int, + "title" : "MP4/WEBM", + "tn_height": int, + "tn_width" : int, + "trip" : "", + "type" : int, + "views" : int, + "width" : int, +}, + +{ + "#url" : "https://2ch.hk/a/", + "#category": ("", "2ch", "board"), + "#class" : _2ch._2chBoardExtractor, + "#pattern" : _2ch._2chThreadExtractor.pattern, + "#count" : range(200, 300), +}, + +) From 4cedf378d5548889256b0192ab4e081e5c570f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 16:28:57 +0100 Subject: [PATCH 70/77] [deviantart] fix AttributeError for URLs without username (#5065) caused by 4f367145 --- gallery_dl/extractor/deviantart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 7df1890e..a46517cd 100644 --- 
a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = (match.group(1) or match.group(2)).lower() + self.user = (match.group(1) or match.group(2) or "").lower() self.offset = 0 def _init(self): From 90b382304a1e8580f888a2c84ca95f74c2827710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 17:30:03 +0100 Subject: [PATCH 71/77] [deviantart] fix KeyError: 'premium_folder_data' (#5063) --- gallery_dl/extractor/deviantart.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index a46517cd..bcfbe73b 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -452,9 +452,11 @@ class DeviantartExtractor(Extractor): return None dev = self.api.deviation(deviation["deviationid"], False) - folder = dev["premium_folder_data"] + folder = deviation["premium_folder_data"] username = dev["author"]["username"] - has_access = folder["has_access"] + + # premium_folder_data is no longer present when user has access (#5063) + has_access = ("premium_folder_data" not in dev) or folder["has_access"] if not has_access and folder["type"] == "watchers" and \ self.config("auto-watch"): From 8ffa0cd3c8c4f6d15ad281bf449812b7bf415bcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 18:24:47 +0100 Subject: [PATCH 72/77] [webtoons] small optimization don't extract the entire 'author_area' and avoid creating a second 'text.extract_from()' object --- gallery_dl/extractor/webtoons.py | 9 +++++---- test/results/webtoons.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 1c7af470..a4259358 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -92,10 +92,11 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): title = extr('<meta property="og:title" content="', '"') descr = extr('<meta property="og:description" content="', '"') - author_area = extr('<div class="author_area">', '</div>') - aa_extr = text.extract_from(author_area) - username = aa_extr('/creator/', '"') - author_name = aa_extr('<span>', '</span>') + if extr('<div class="author_area"', '\n'): + username = extr('/creator/', '"') + author_name = extr('<span>', '</span>') + else: + username = author_name = "" return { "genre" : self.genre, diff --git a/test/results/webtoons.py b/test/results/webtoons.py index 9ca93446..82831f02 100644 --- a/test/results/webtoons.py +++ b/test/results/webtoons.py @@ -20,6 +20,22 @@ __tests__ = ( "42055e44659f6ffc410b3fb6557346dfbb993df3", "49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9", ], + + "author_name" : "Chris McCoy", + "comic" : "safely-endangered", + "comic_name" : "Safely Endangered", + "count" : 5, + "description" : "Silly comics for silly people.", + "episode" : "572", + "episode_name": "Ep. 572 - Earth", + "episode_no" : "572", + "genre" : "comedy", + "lang" : "en", + "language" : "English", + "num" : range(1, 5), + "title" : "Safely Endangered - Ep. 
572 - Earth", + "title_no" : "352", + "username" : "safelyendangered", }, { From 4d6ec6958d29bd22739ba5fe27086e715d51fbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 15 Jan 2024 22:37:33 +0100 Subject: [PATCH 73/77] [scripts] add 'push --force' to pull-request --- scripts/pull-request | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/pull-request b/scripts/pull-request index defdc11f..dea9b292 100755 --- a/scripts/pull-request +++ b/scripts/pull-request @@ -41,6 +41,10 @@ case "${2,,}" in call git push "$USER" HEAD:"$BRANCH" ;; +"pf"|"push-force") + call git push --force "$USER" HEAD:"$BRANCH" + ;; + "d"|"delete") call git switch master call git branch -D "$USER-$BRANCH" From 3d68eda4abcfde18ecf377f140b8ad6ec4c2de6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 16 Jan 2024 00:24:30 +0100 Subject: [PATCH 74/77] [kemonoparty] add 'revision_hash' metadata (#4706, #4727, #5013) A SHA1 hexdigest of other relevant metadata fields like title, content, file and attachment URLs. This value does NOT reflect which revisions are listed on the website. Neither does 'edited' or any other metadata field (combinations). --- gallery_dl/extractor/kemonoparty.py | 26 ++++++++++++++++++++++---- test/results/kemonoparty.py | 2 ++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index c24e57d1..10228b5c 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -9,9 +9,10 @@ """Extractors for https://kemono.party/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception from ..cache import cache, memcache import itertools +import json import re BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" @@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor): Extractor.__init__(self, match) def _init(self): + self.revisions = self.config("revisions") self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall + self._json_dumps = json.JSONEncoder( + ensure_ascii=False, check_circular=False, + sort_keys=True, separators=(",", ":")).encode def items(self): find_hash = re.compile(HASH_PATTERN).match @@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor): idx = len(revs) for rev in revs: + rev["revision_hash"] = self._revision_hash(rev) rev["revision_index"] = idx idx -= 1 return revs + def _revision_hash(self, revision): + rev = revision.copy() + rev.pop("revision_id", None) + rev.pop("added", None) + rev.pop("next", None) + rev.pop("prev", None) + rev["file"].pop("name", None) + for a in rev["attachments"]: + a.pop("name", None) + return util.sha1(self._json_dumps(rev)) + def _validate(response): return (response.headers["content-length"] != "9" or @@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor): url = self.api_url params = text.parse_query(self.query) params["o"] = text.parse_int(params.get("o")) - revisions = self.config("revisions") while True: posts = self.request(url, params=params).json() - if revisions: + if self.revisions: for post in posts: + post["revision_hash"] = self._revision_hash(post) post["revision_id"] = 0 post_url = "{}/post/{}".format(self.api_url, post["id"]) try: @@ -296,7 +313,8 @@ class 
KemonopartyPostExtractor(KemonopartyExtractor): def posts(self): if not self.revision: post = self.request(self.api_url).json() - if self.config("revisions"): + if self.revisions: + post["revision_hash"] = self._revision_hash(post) post["revision_id"] = 0 try: revs = self._post_revisions(self.api_url) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 5bd541a3..c3dbdf73 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -177,6 +177,7 @@ __tests__ = ( "revision_id": 142470, "revision_index": 2, + "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", }, { @@ -190,6 +191,7 @@ __tests__ = ( "revision_id": range(134996, 3052965), "revision_index": range(1, 9), + "revision_hash": r"re:^[0-9a-f]{40}$", }, From e33056adcd1469a80f1f7656848d1cf6cde5b3f6 Mon Sep 17 00:00:00 2001 From: Ailothaen <mail@ailothaen.fr> Date: Sun, 27 Feb 2022 19:40:15 +0100 Subject: [PATCH 75/77] [wikimedia] Add Wikipedia/Wikimedia extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/wikimedia.py | 172 ++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 gallery_dl/extractor/wikimedia.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8e712961..86308917 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -178,6 +178,7 @@ modules = [ "weibo", "wikiart", "wikifeet", + "wikimedia", "xhamster", "xvideos", "zerochan", diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py new file mode 100644 index 00000000..41cc1c9e --- /dev/null +++ b/gallery_dl/extractor/wikimedia.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022-2022 Ailothaen +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Wikimedia and Wikipedia. 
+(Other Mediawiki instances use the same API,so a similar extractor +could be written) + +Various reference: +https://www.mediawiki.org/wiki/API:Query +https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category +""" + +from .common import Extractor, Message +import time +import re + + +class WikimediaArticleExtractor(Extractor): + category = "wikimedia" + subcategory = "article" + filename_fmt = "{filename}.{extension}" + archive_fmt = "{filename}" + pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" + directory_fmt = ("{category}", "{page}") + test = ( + ("https://en.wikipedia.org/wiki/Athena"), + ("https://zh.wikipedia.org/wiki/太阳"), + ("https://simple.wikipedia.org/wiki/Hydrogen", { + "count": ">= 2" + }) + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.lang, self.page = match.groups() + + def items(self): + continuation = None + gimcontinuation = None + + while True: + if continuation is None: + file_list_request = self.request( + "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa + lang=self.lang, page=self.page + ) + ) + else: + file_list_request = self.request( + "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format( # noqa + lang=self.lang, + page=self.page, + continuation=continuation, + gimcontinuation=gimcontinuation, + ) + ) + file_list = file_list_request.json() + + for file_index in list(file_list["query"]["pages"]): + image = file_list["query"]["pages"][file_index]["imageinfo"][0] + + metadata = image + metadata["filename"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[0] + metadata["extension"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[1] + + yield Message.Directory, {"page": self.page, "lang": self.lang} + yield Message.Url, image["url"], image + else: + # We arrived at the end of the response + # checking if there are more files to retrieve + try: + continuation_info = file_list["continue"] + except KeyError: + # No more continuation info: all files were retrieved + break + else: + # Continuation info is present + # there are still files to retrieve + continuation = continuation_info["continue"] + gimcontinuation = continuation_info["gimcontinue"] + + # giving a rest to Wikipedia API + time.sleep(1) + + +class WikimediaCategoryExtractor(Extractor): + category = "wikimedia" + subcategory = "category" + filename_fmt = "{filename}.{extension}" + archive_fmt = "{filename}" + pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" + directory_fmt = ("{category}", "{page}") + + test = ( + ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"), # noqa + ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", { # noqa + "count": ">= 21" + }) + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.page = match.groups()[0] + + def items(self): + continuation = None + gcmcontinuation = None + + while True: + if continuation is None: + file_list_request = self.request( + 
"https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa + page=self.page + ) + ) + else: + file_list_request = self.request( + "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format( # noqa + page=self.page, + continuation=continuation, + gcmcontinuation=gcmcontinuation, + ) + ) + file_list = file_list_request.json() + + for file_index in list(file_list["query"]["pages"]): + image = file_list["query"]["pages"][file_index]["imageinfo"][0] + + metadata = image + metadata["filename"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[0] + metadata["extension"] = WikimediaUtils.clean_name( + image["canonicaltitle"] + )[1] + + yield Message.Directory, {"page": self.page, "lang": "common"} + yield Message.Url, image["url"], image + else: + # We arrived at the end of the response + # checking if there are more files to retrieve + try: + continuation_info = file_list["continue"] + except KeyError: + # No more continuation info: all files were retrieved + break + else: + # Continuation info is present + # there are still files to retrieve + continuation = continuation_info["continue"] + gcmcontinuation = continuation_info["gcmcontinue"] + + # giving a rest to Wikipedia API + time.sleep(1) + + +class WikimediaUtils: + @staticmethod + def clean_name(name): + name = re.sub(r"^\w+:", "", name) + filename = ".".join(name.split(".")[:-1]) + extension = name.split(".")[-1] + return filename, extension From 221f54309cf5437ad887e89a5c71d1a4263294d6 Mon Sep 17 00:00:00 2001 From: Ailothaen <mail@ailothaen.fr> Date: Mon, 25 Apr 2022 23:14:16 +0200 Subject: [PATCH 76/77] [wikimedia] Improved archive identifiers --- gallery_dl/extractor/wikimedia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 41cc1c9e..a2ddfa2c 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -24,7 +24,7 @@ class WikimediaArticleExtractor(Extractor): category = "wikimedia" subcategory = "article" filename_fmt = "{filename}.{extension}" - archive_fmt = "{filename}" + archive_fmt = "a_{sha1}" pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" directory_fmt = ("{category}", "{page}") test = ( @@ -96,7 +96,7 @@ class WikimediaCategoryExtractor(Extractor): category = "wikimedia" subcategory = "category" filename_fmt = "{filename}.{extension}" - archive_fmt = "{filename}" + archive_fmt = "c_{sha1}" pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" directory_fmt = ("{category}", "{page}") From c3c1635ef35df7ef3f8884bd933578e79a2ade8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 16 Jan 2024 22:08:03 +0100 Subject: [PATCH 77/77] [wikimedia] update - rewrite using BaseExtractor - support most Wiki* domains - update docs/supportedsites - add tests --- docs/supportedsites.md | 58 ++++++ gallery_dl/extractor/wikimedia.py | 284 ++++++++++++++---------------- scripts/supportedsites.py | 1 + test/results/wikibooks.py | 23 +++ 
test/results/wikimediacommons.py | 23 +++ test/results/wikinews.py | 23 +++ test/results/wikipedia.py | 53 ++++++ test/results/wikiquote.py | 23 +++ test/results/wikisource.py | 23 +++ test/results/wikispecies.py | 25 +++ test/results/wikiversity.py | 23 +++ test/results/wiktionary.py | 23 +++ 12 files changed, 426 insertions(+), 156 deletions(-) create mode 100644 test/results/wikibooks.py create mode 100644 test/results/wikimediacommons.py create mode 100644 test/results/wikinews.py create mode 100644 test/results/wikipedia.py create mode 100644 test/results/wikiquote.py create mode 100644 test/results/wikisource.py create mode 100644 test/results/wikispecies.py create mode 100644 test/results/wikiversity.py create mode 100644 test/results/wiktionary.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 53c88335..d3d2a8a3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW. <td></td> </tr> +<tr> + <td colspan="4"><strong>Wikimedia Instances</strong></td> +</tr> +<tr> + <td>Wikipedia</td> + <td>https://www.wikipedia.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wiktionary</td> + <td>https://www.wiktionary.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikiquote</td> + <td>https://www.wikiquote.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikibooks</td> + <td>https://www.wikibooks.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikisource</td> + <td>https://www.wikisource.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikinews</td> + <td>https://www.wikinews.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikiversity</td> + <td>https://www.wikiversity.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikispecies</td> + <td>https://species.wikimedia.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> +<tr> + <td>Wikimedia Commons</td> + <td>https://commons.wikimedia.org/</td> + <td>Articles, Categories</td> + <td></td> +</tr> + <tr> <td colspan="4"><strong>Moebooru and MyImouto</strong></td> </tr> diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index a2ddfa2c..1a896515 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -1,172 +1,144 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2022 Ailothaen +# Copyright 2022 Ailothaen +# Copyright 2024 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for Wikimedia and Wikipedia. -(Other Mediawiki instances use the same API,so a similar extractor -could be written) +"""Extractors for Wikimedia and Wikipedia""" -Various reference: -https://www.mediawiki.org/wiki/API:Query -https://opendata.stackexchange.com/questions/13381/wikimedia-commons-api-image-by-category -""" - -from .common import Extractor, Message -import time -import re +from .common import BaseExtractor, Message +from .. 
import text -class WikimediaArticleExtractor(Extractor): - category = "wikimedia" +class WikimediaExtractor(BaseExtractor): + """Base class for wikimedia extractors""" + basecategory = "wikimedia" + directory_fmt = ("{category}", "{page}") + archive_fmt = "{sha1}" + request_interval = (1.0, 2.0) + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.title = match.group(match.lastindex) + + def items(self): + for info in self._pagination(self.params): + image = info["imageinfo"][0] + + image["metadata"] = { + m["name"]: m["value"] + for m in image["metadata"]} + image["commonmetadata"] = { + m["name"]: m["value"] + for m in image["commonmetadata"]} + + filename = image["canonicaltitle"] + image["filename"], _, image["extension"] = \ + filename.partition(":")[2].rpartition(".") + image["date"] = text.parse_datetime( + image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") + image["page"] = self.title + + yield Message.Directory, image + yield Message.Url, image["url"], image + + def _pagination(self, params): + """ + https://www.mediawiki.org/wiki/API:Query + https://opendata.stackexchange.com/questions/13381 + """ + + url = self.root + "/w/api.php" + params["action"] = "query" + params["format"] = "json" + + while True: + data = self.request(url, params=params).json() + + try: + pages = data["query"]["pages"] + except KeyError: + pass + else: + yield from pages.values() + + try: + continuation = data["continue"] + except KeyError: + break + params.update(continuation) + + +BASE_PATTERN = WikimediaExtractor.update({ + "wikipedia": { + "root": None, + "pattern": r"[a-z]{2,}\.wikipedia\.org", + }, + "wiktionary": { + "root": None, + "pattern": r"[a-z]{2,}\.wiktionary\.org", + }, + "wikiquote": { + "root": None, + "pattern": r"[a-z]{2,}\.wikiquote\.org", + }, + "wikibooks": { + "root": None, + "pattern": r"[a-z]{2,}\.wikibooks\.org", + }, + "wikisource": { + "root": None, + "pattern": r"[a-z]{2,}\.wikisource\.org", + }, + "wikinews": { + "root": None, + "pattern": r"[a-z]{2,}\.wikinews\.org", + }, + "wikiversity": { + "root": None, + "pattern": r"[a-z]{2,}\.wikiversity\.org", + }, + "wikispecies": { + "root": "https://species.wikimedia.org", + "pattern": r"species\.wikimedia\.org", + }, + "wikimediacommons": { + "root": "https://commons.wikimedia.org", + "pattern": r"commons\.wikimedia\.org", + }, +}) + + +class WikimediaArticleExtractor(WikimediaExtractor): + """Extractor for wikimedia articles""" subcategory = "article" - filename_fmt = "{filename}.{extension}" - archive_fmt = "a_{sha1}" - pattern = r"https?://([a-z]{2,})\.wikipedia\.org/wiki/([^#/\?]+)" - directory_fmt = ("{category}", "{page}") - test = ( - ("https://en.wikipedia.org/wiki/Athena"), - ("https://zh.wikipedia.org/wiki/太阳"), - ("https://simple.wikipedia.org/wiki/Hydrogen", { - "count": ">= 2" - }) - ) + pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)" + example = "https://en.wikipedia.org/wiki/TITLE" - def __init__(self, match): - Extractor.__init__(self, match) - self.lang, self.page = match.groups() - - def items(self): - continuation = None - gimcontinuation = None - - while True: - if continuation is None: - file_list_request = self.request( - "https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa - lang=self.lang, page=self.page - ) - ) - else: - file_list_request = self.request( - 
"https://{lang}.wikipedia.org/w/api.php?action=query&generator=images&format=json&titles={page}&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gimcontinue={gimcontinuation}".format( # noqa - lang=self.lang, - page=self.page, - continuation=continuation, - gimcontinuation=gimcontinuation, - ) - ) - file_list = file_list_request.json() - - for file_index in list(file_list["query"]["pages"]): - image = file_list["query"]["pages"][file_index]["imageinfo"][0] - - metadata = image - metadata["filename"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[0] - metadata["extension"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[1] - - yield Message.Directory, {"page": self.page, "lang": self.lang} - yield Message.Url, image["url"], image - else: - # We arrived at the end of the response - # checking if there are more files to retrieve - try: - continuation_info = file_list["continue"] - except KeyError: - # No more continuation info: all files were retrieved - break - else: - # Continuation info is present - # there are still files to retrieve - continuation = continuation_info["continue"] - gimcontinuation = continuation_info["gimcontinue"] - - # giving a rest to Wikipedia API - time.sleep(1) + def _init(self): + self.params = { + "generator": "images", + "titles" : self.title, + "prop" : "imageinfo", + "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", + } -class WikimediaCategoryExtractor(Extractor): - category = "wikimedia" +class WikimediaCategoryExtractor(WikimediaExtractor): subcategory = "category" - filename_fmt = "{filename}.{extension}" - archive_fmt = "c_{sha1}" - pattern = r"https?://commons.wikimedia.org/wiki/Category:([^#/\?]+)" - directory_fmt = ("{category}", "{page}") + pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)" + example = "https://commons.wikimedia.org/wiki/Category:NAME" - test = ( - ("https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro"), # noqa - ("https://commons.wikimedia.org/wiki/Category:Tyto_alba_in_flight_(captive)", { # noqa - "count": ">= 21" - }) - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page = match.groups()[0] - - def items(self): - continuation = None - gcmcontinuation = None - - while True: - if continuation is None: - file_list_request = self.request( - "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth".format( # noqa - page=self.page - ) - ) - else: - file_list_request = self.request( - "https://commons.wikimedia.org/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{page}&gcmtype=file&prop=imageinfo&format=json&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|sha1|mime|metadata|commonmetadata|extmetadata|bitdepth&continue={continuation}&gcmcontinue={gcmcontinuation}".format( # noqa - page=self.page, - continuation=continuation, - gcmcontinuation=gcmcontinuation, - ) - ) - file_list = file_list_request.json() - - for file_index in list(file_list["query"]["pages"]): - image = file_list["query"]["pages"][file_index]["imageinfo"][0] - - metadata = image - metadata["filename"] = WikimediaUtils.clean_name( - image["canonicaltitle"] - )[0] - metadata["extension"] 
= WikimediaUtils.clean_name( - image["canonicaltitle"] - )[1] - - yield Message.Directory, {"page": self.page, "lang": "common"} - yield Message.Url, image["url"], image - else: - # We arrived at the end of the response - # checking if there are more files to retrieve - try: - continuation_info = file_list["continue"] - except KeyError: - # No more continuation info: all files were retrieved - break - else: - # Continuation info is present - # there are still files to retrieve - continuation = continuation_info["continue"] - gcmcontinuation = continuation_info["gcmcontinue"] - - # giving a rest to Wikipedia API - time.sleep(1) - - -class WikimediaUtils: - @staticmethod - def clean_name(name): - name = re.sub(r"^\w+:", "", name) - filename = ".".join(name.split(".")[:-1]) - extension = name.split(".")[-1] - return filename, extension + def _init(self): + self.params = { + "generator": "categorymembers", + "gcmtitle" : self.title, + "gcmtype" : "file", + "prop" : "imageinfo", + "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", + } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index d3107b47..34566465 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -139,6 +139,7 @@ CATEGORY_MAP = { "webmshare" : "webmshare", "webtoons" : "Webtoon", "wikiart" : "WikiArt.org", + "wikimediacommons": "Wikimedia Commons", "xbunkr" : "xBunkr", "xhamster" : "xHamster", "xvideos" : "XVideos", diff --git a/test/results/wikibooks.py b/test/results/wikibooks.py new file mode 100644 index 00000000..882741d5 --- /dev/null +++ b/test/results/wikibooks.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikibooks.org/wiki/Title", + "#category": ("wikimedia", "wikibooks", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikibooks.org/wiki/Category:Title", + "#category": ("wikimedia", "wikibooks", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py new file mode 100644 index 00000000..6cc03e34 --- /dev/null +++ b/test/results/wikimediacommons.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg", + "#category": ("wikimedia", "wikimediacommons", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro", + "#category": ("wikimedia", "wikimediacommons", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikinews.py b/test/results/wikinews.py new file mode 100644 index 00000000..8a2af25e --- /dev/null +++ b/test/results/wikinews.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikinews.org/wiki/Title", + "#category": ("wikimedia", "wikinews", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikinews.org/wiki/Category:Title", + "#category": ("wikimedia", "wikinews", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py new file mode 100644 index 00000000..87499878 --- /dev/null +++ b/test/results/wikipedia.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikipedia.org/wiki/Title", + "#category": ("wikimedia", "wikipedia", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikipedia.org/wiki/Athena", + "#category": ("wikimedia", "wikipedia", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#pattern" : r"https://upload.wikimedia.org/wikipedia/.+", + "#count" : range(50, 100), + + "bitdepth" : int, + "canonicaltitle": str, + "comment" : str, + "commonmetadata": dict, + "date" : "type:datetime", + "descriptionshorturl": str, + "descriptionurl": str, + "extension" : str, + "extmetadata" : dict, + "filename" : str, + "height" : int, + "metadata" : dict, + "mime" : r"re:image/\w+", + "page" : "Athena", + "sha1" : r"re:^[0-9a-f]{40}$", + "size" : int, + "timestamp" : str, + "url" : str, + "user" : str, + "userid" : int, + "width" : int, +}, + +{ + "#url" : "https://en.wikipedia.org/wiki/Category:Physics", + "#category": ("wikimedia", "wikipedia", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py new file mode 100644 index 00000000..5e6fb321 --- /dev/null +++ b/test/results/wikiquote.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikiquote.org/wiki/Title", + "#category": ("wikimedia", "wikiquote", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikiquote.org/wiki/Category:Title", + "#category": ("wikimedia", "wikiquote", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikisource.py b/test/results/wikisource.py new file mode 100644 index 00000000..afdee23e --- /dev/null +++ b/test/results/wikisource.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikisource.org/wiki/Title", + "#category": ("wikimedia", "wikisource", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikisource.org/wiki/Category:Title", + "#category": ("wikimedia", "wikisource", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py new file mode 100644 index 00000000..d455fbac --- /dev/null +++ b/test/results/wikispecies.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://species.wikimedia.org/wiki/Geranospiza", + "#category": ("wikimedia", "wikispecies", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#urls" : "https://upload.wikimedia.org/wikipedia/commons/0/01/Geranospiza_caerulescens.jpg", + "#sha1_content": "3a17c14b15489928e4154f826af1c42afb5a523e", +}, + +{ + "#url" : "https://species.wikimedia.org/wiki/Category:Names", + "#category": ("wikimedia", "wikispecies", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py new file mode 100644 index 00000000..58565f49 --- /dev/null +++ b/test/results/wikiversity.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wikiversity.org/wiki/Title", + "#category": ("wikimedia", "wikiversity", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wikiversity.org/wiki/Category:Title", + "#category": ("wikimedia", "wikiversity", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +) diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py new file mode 100644 index 00000000..c7a016f5 --- /dev/null +++ b/test/results/wiktionary.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.wiktionary.org/wiki/Word", + "#category": ("wikimedia", "wiktionary", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, +}, + +{ + "#url" : "https://en.wiktionary.org/wiki/Category:Words", + "#category": ("wikimedia", "wiktionary", "category"), + "#class" : wikimedia.WikimediaCategoryExtractor, +}, + +)
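
Note on the continuation protocol: `WikimediaExtractor._pagination()` above implements the standard MediaWiki `action=query` continuation loop, in which each response may carry a top-level "continue" object that is merged back into the request parameters and the request repeated until that object disappears. Below is a minimal standalone sketch of that loop using plain `requests`; the helper name, the `timeout`, the reduced `iiprop` subset, and the root/title values are illustrative assumptions, not gallery-dl API. It only mirrors the logic of the method above, without the extractor's session handling or rate limiting:

    import requests

    def wiki_image_urls(root, title):
        """Yield image URLs for one article, following API continuation."""
        url = root + "/w/api.php"
        params = {
            "action"   : "query",
            "format"   : "json",
            "generator": "images",        # enumerate images used on the page
            "titles"   : title,
            "prop"     : "imageinfo",
            "iiprop"   : "url|sha1|canonicaltitle",
        }
        while True:
            data = requests.get(url, params=params, timeout=30).json()
            for page in data.get("query", {}).get("pages", {}).values():
                info = page.get("imageinfo")
                if info:                  # skip entries without imageinfo
                    yield info[0]["url"]
            if "continue" not in data:    # no continuation: all results seen
                break
            # e.g. {"gimcontinue": "...", "continue": "gimcontinue||"}
            params.update(data["continue"])

    for u in wiki_image_urls("https://en.wikipedia.org", "Athena"):
        print(u)

The same merge-and-repeat pattern also covers the `gcmcontinue` key returned for `generator=categorymembers`, which is why the rewritten extractor no longer needs the separate `continue`/`gimcontinue`/`gcmcontinue` bookkeeping of the code it removes.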