From cea062ffc5cbf04dfc1e4fec5adfe3027b99043f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 3 Nov 2024 17:51:04 +0100 Subject: [PATCH] [everia] update - implement general _pagination method - simplify code - adjust URL patterns - update test results --- docs/supportedsites.md | 2 +- gallery_dl/extractor/everia.py | 121 ++++++++++++++++----------------- scripts/supportedsites.py | 1 + test/results/everia.py | 48 ++++++++----- 4 files changed, 90 insertions(+), 82 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e5eac728..bd4b01db 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -230,7 +230,7 @@ Consider all listed sites to potentially be NSFW. - Everia + EVERIA.CLUB https://everia.club Categories, Dates, Posts, Search Results, Tag Searches diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index c46d5605..94444ffb 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann -# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. @@ -15,92 +13,87 @@ import re BASE_PATTERN = r"(?:https?://)?everia\.club" -class EveriaPostExtractor(Extractor): +class EveriaExtractor(Extractor): category = "everia" - subcategory = "post" root = "https://everia.club" - pattern = BASE_PATTERN + r"/(\d{4}/\d{2}/\d{2}/[^/]+)/?" - example = "https://everia.club/0000/00/00/TITLE" - directory_fmt = ("{category}", "{title}") - - def __init__(self, match): - super().__init__(match) - self.url = match.group(0) def items(self): - page = self.request(self.url).text + data = {"_extractor": EveriaPostExtractor} + for url in self.posts(): + yield Message.Queue, url, data + + def posts(self): + return self._pagination(self.groups[0]) + + def _pagination(self, path, params=None, pnum=1): + find_posts = re.compile(r'thumbnail">\s*= 300: + return + + yield from find_posts(response.text) + pnum += 1 + + +class EveriaPostExtractor(EveriaExtractor): + subcategory = "post" + directory_fmt = ("{category}", "{title}") + archive_fmt = "{post_url}_{num}" + pattern = BASE_PATTERN + r"(/\d{4}/\d{2}/\d{2}/[^/?#]+)" + example = "https://everia.club/0000/00/00/TITLE" + + def items(self): + url = self.root + self.groups[0] + page = self.request(url).text content = text.extr(page, 'itemprop="text">', "") - urls = re.findall(r'img.*?src=\"(.+?)\"', content) + urls = re.findall(r'img.*?src="([^"]+)', content) data = { "title": text.unescape( - text.extr(page, 'itemprop="headline">', "") - ), - "url": self.url, + text.extr(page, 'itemprop="headline">', "")), "tags": list(text.extract_iter(page, 'rel="tag">', "")), + "post_url": url, "post_category": text.extr( - page, "post-in-category-", " " - ).capitalize(), + page, "post-in-category-", " ").capitalize(), "count": len(urls), } yield Message.Directory, data for data["num"], url in enumerate(urls, 1): - text.nameext_from_url(text.unquote(url), data) - yield Message.Url, url, data + yield Message.Url, url, text.nameext_from_url(url, data) -class EveriaTagExtractor(EveriaPostExtractor): +class EveriaTagExtractor(EveriaExtractor): subcategory = "tag" - pattern = BASE_PATTERN + r"/(tag/[^/]+)/?" + pattern = BASE_PATTERN + r"(/tag/[^/?#]+)" example = "https://everia.club/tag/TAG" - def __init__(self, match): - super().__init__(match) - self.id = match.group(1) - def _posts(self, page): - posts = re.findall(r'thumbnail\">\s* 50", }, + { - "#url" : "https://everia.club/category/japan/", - "#category" : ("", "everia", "category"), - "#class" : everia.EveriaCategoryExtractor, + "#url" : "https://everia.club/category/japan/", + "#class" : everia.EveriaCategoryExtractor, + "#pattern": everia.EveriaPostExtractor.pattern, + "#range" : "1-50", + "#count" : 50, }, + { - "#url" : "https://everia.club/?s=saika", - "#category" : ("", "everia", "search"), - "#class" : everia.EveriaSearchExtractor, -} + "#url" : "https://everia.club/2023/10/05/", + "#class" : everia.EveriaDateExtractor, + "#pattern": everia.EveriaPostExtractor.pattern, + "#count" : 34, +}, + +{ + "#url" : "https://everia.club/?s=saika", + "#class" : everia.EveriaSearchExtractor, + "#pattern": everia.EveriaPostExtractor.pattern, + "#range" : "1-15", + "#count" : 15, +}, + )