From d31a3b5da38bd63ede507896eb738a87d5374dc5 Mon Sep 17 00:00:00 2001 From: missionfloyd Date: Mon, 23 Sep 2024 00:35:19 -0600 Subject: [PATCH 1/2] [everia.club] Add support - Unescape title and URL - Add tags and categories metadata Lookup tag id with API instead of downloading tag page - Add category extractor - Add tests - Rename EveriaExtractor to EveriaPostExtractor - Fix EveriaPostExtractor example - Lookup tags/categories by post id - Add date extractor - Remove leftover pages parameter - Add error handling for invalid dates. - Add filename numbering Parse date - Rename extract() to images() - Remove html import - Fix search/date URLs with page number - Fix tag/category search - Fix post extractor - Fix tag, category extractors - Fix search extractor - Only load first page once - Fix date extractor - Fix tests - Clean up search extractor --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/everia.py | 106 +++++++++++++++++++++++++++++++ test/results/everia.py | 37 +++++++++++ 4 files changed, 150 insertions(+) create mode 100644 gallery_dl/extractor/everia.py create mode 100644 test/results/everia.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ba5aed8f..e5eac728 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -229,6 +229,12 @@ Consider all listed sites to potentially be NSFW. Albums, Search Results, User Profiles + + Everia + https://everia.club + Categories, Dates, Posts, Search Results, Tag Searches + + ExHentai https://exhentai.org/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 067ec013..b707ea25 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -47,6 +47,7 @@ modules = [ "dynastyscans", "e621", "erome", + "everia", "exhentai", "fanbox", "fanleaks", diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py new file mode 100644 index 00000000..c46d5605 --- /dev/null +++ b/gallery_dl/extractor/everia.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019-2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://everia.club""" + +from .common import Extractor, Message +from .. import text +import re + +BASE_PATTERN = r"(?:https?://)?everia\.club" + + +class EveriaPostExtractor(Extractor): + category = "everia" + subcategory = "post" + root = "https://everia.club" + pattern = BASE_PATTERN + r"/(\d{4}/\d{2}/\d{2}/[^/]+)/?" + example = "https://everia.club/0000/00/00/TITLE" + directory_fmt = ("{category}", "{title}") + + def __init__(self, match): + super().__init__(match) + self.url = match.group(0) + + def items(self): + page = self.request(self.url).text + content = text.extr(page, 'itemprop="text">', "") + urls = re.findall(r'img.*?src=\"(.+?)\"', content) + + data = { + "title": text.unescape( + text.extr(page, 'itemprop="headline">', "") + ), + "url": self.url, + "tags": list(text.extract_iter(page, 'rel="tag">', "")), + "post_category": text.extr( + page, "post-in-category-", " " + ).capitalize(), + "count": len(urls), + } + + yield Message.Directory, data + for data["num"], url in enumerate(urls, 1): + text.nameext_from_url(text.unquote(url), data) + yield Message.Url, url, data + + +class EveriaTagExtractor(EveriaPostExtractor): + subcategory = "tag" + pattern = BASE_PATTERN + r"/(tag/[^/]+)/?" + example = "https://everia.club/tag/TAG" + + def __init__(self, match): + super().__init__(match) + self.id = match.group(1) + + def _posts(self, page): + posts = re.findall(r'thumbnail\">\s* Date: Sun, 3 Nov 2024 17:51:04 +0100 Subject: [PATCH 2/2] [everia] update - implement general _pagination method - simplify code - adjust URL patterns - update test results --- docs/supportedsites.md | 2 +- gallery_dl/extractor/everia.py | 121 ++++++++++++++++----------------- scripts/supportedsites.py | 1 + test/results/everia.py | 48 ++++++++----- 4 files changed, 90 insertions(+), 82 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e5eac728..bd4b01db 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -230,7 +230,7 @@ Consider all listed sites to potentially be NSFW. - Everia + EVERIA.CLUB https://everia.club Categories, Dates, Posts, Search Results, Tag Searches diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index c46d5605..94444ffb 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann -# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. @@ -15,92 +13,87 @@ import re BASE_PATTERN = r"(?:https?://)?everia\.club" -class EveriaPostExtractor(Extractor): +class EveriaExtractor(Extractor): category = "everia" - subcategory = "post" root = "https://everia.club" - pattern = BASE_PATTERN + r"/(\d{4}/\d{2}/\d{2}/[^/]+)/?" - example = "https://everia.club/0000/00/00/TITLE" - directory_fmt = ("{category}", "{title}") - - def __init__(self, match): - super().__init__(match) - self.url = match.group(0) def items(self): - page = self.request(self.url).text + data = {"_extractor": EveriaPostExtractor} + for url in self.posts(): + yield Message.Queue, url, data + + def posts(self): + return self._pagination(self.groups[0]) + + def _pagination(self, path, params=None, pnum=1): + find_posts = re.compile(r'thumbnail">\s*= 300: + return + + yield from find_posts(response.text) + pnum += 1 + + +class EveriaPostExtractor(EveriaExtractor): + subcategory = "post" + directory_fmt = ("{category}", "{title}") + archive_fmt = "{post_url}_{num}" + pattern = BASE_PATTERN + r"(/\d{4}/\d{2}/\d{2}/[^/?#]+)" + example = "https://everia.club/0000/00/00/TITLE" + + def items(self): + url = self.root + self.groups[0] + page = self.request(url).text content = text.extr(page, 'itemprop="text">', "") - urls = re.findall(r'img.*?src=\"(.+?)\"', content) + urls = re.findall(r'img.*?src="([^"]+)', content) data = { "title": text.unescape( - text.extr(page, 'itemprop="headline">', "") - ), - "url": self.url, + text.extr(page, 'itemprop="headline">', "")), "tags": list(text.extract_iter(page, 'rel="tag">', "")), + "post_url": url, "post_category": text.extr( - page, "post-in-category-", " " - ).capitalize(), + page, "post-in-category-", " ").capitalize(), "count": len(urls), } yield Message.Directory, data for data["num"], url in enumerate(urls, 1): - text.nameext_from_url(text.unquote(url), data) - yield Message.Url, url, data + yield Message.Url, url, text.nameext_from_url(url, data) -class EveriaTagExtractor(EveriaPostExtractor): +class EveriaTagExtractor(EveriaExtractor): subcategory = "tag" - pattern = BASE_PATTERN + r"/(tag/[^/]+)/?" + pattern = BASE_PATTERN + r"(/tag/[^/?#]+)" example = "https://everia.club/tag/TAG" - def __init__(self, match): - super().__init__(match) - self.id = match.group(1) - def _posts(self, page): - posts = re.findall(r'thumbnail\">\s* 50", }, + { - "#url" : "https://everia.club/category/japan/", - "#category" : ("", "everia", "category"), - "#class" : everia.EveriaCategoryExtractor, + "#url" : "https://everia.club/category/japan/", + "#class" : everia.EveriaCategoryExtractor, + "#pattern": everia.EveriaPostExtractor.pattern, + "#range" : "1-50", + "#count" : 50, }, + { - "#url" : "https://everia.club/?s=saika", - "#category" : ("", "everia", "search"), - "#class" : everia.EveriaSearchExtractor, -} + "#url" : "https://everia.club/2023/10/05/", + "#class" : everia.EveriaDateExtractor, + "#pattern": everia.EveriaPostExtractor.pattern, + "#count" : 34, +}, + +{ + "#url" : "https://everia.club/?s=saika", + "#class" : everia.EveriaSearchExtractor, + "#pattern": everia.EveriaPostExtractor.pattern, + "#range" : "1-15", + "#count" : 15, +}, + )