[chevereto] add generic extractors (#4664)

- support jpgfish
- support pixl.li / pixl.is (#3179, #4357)
2024-11-22 02:32:33 +01:00 · 2023-10-15 23:50:36 +02:00 · 2023-10-15 23:50:36 +02:00 · 2911ed1240
commit 2911ed1240
parent ade8347ead
6 changed files with 234 additions and 155 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -427,12 +427,6 @@ Consider all sites to be NSFW unless otherwise known.
    <td>Games</td>
    <td></td>
 </tr>
-<tr>
-    <td>JPG Fish</td>
-    <td>https://jpg1.su/</td>
-    <td>Albums, individual Images, User Profiles</td>
-    <td></td>
-</tr>
 <tr>
    <td>Keenspot</td>
    <td>http://www.keenspot.com/</td>
@@ -998,6 +992,22 @@ Consider all sites to be NSFW unless otherwise known.
    <td></td>
 </tr>

+<tr>
+    <td colspan="4"><strong>Chevereto Instances</strong></td>
+</tr>
+<tr>
+    <td>JPG Fish</td>
+    <td>https://jpg2.su/</td>
+    <td>Albums, individual Images, User Profiles</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Pixl</td>
+    <td>https://pixl.li/</td>
+    <td>Albums, individual Images, User Profiles</td>
+    <td></td>
+</tr>
+
 <tr>
    <td colspan="4"><strong>Danbooru Instances</strong></td>
 </tr>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -28,6 +28,7 @@ modules = [
    "blogger",
    "bunkr",
    "catbox",
+    "chevereto",
    "comicvine",
    "cyberdrop",
    "danbooru",
@@ -73,7 +74,6 @@ modules = [
    "issuu",
    "itaku",
    "itchio",
-    "jpgfish",
    "jschan",
    "kabeuchi",
    "keenspot",
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Chevereto galleries"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class CheveretoExtractor(BaseExtractor):
+    """Base class for chevereto extractors"""
+    basecategory = "chevereto"
+    directory_fmt = ("{category}", "{user}", "{album}",)
+    archive_fmt = "{id}"
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+        self.path = match.group(match.lastindex)
+
+    def _pagination(self, url):
+        while url:
+            page = self.request(url).text
+
+            for item in text.extract_iter(
+                    page, '<div class="list-item-image ', 'image-container'):
+                yield text.extr(item, '<a href="', '"')
+
+            url = text.extr(page, '<a data-pagination="next" href="', '" ><')
+
+
+BASE_PATTERN = CheveretoExtractor.update({
+    "jpgfish": {
+        "root": "https://jpg2.su",
+        "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
+    },
+    "pixl": {
+        "root": "https://pixl.li",
+        "pattern": r"pixl\.(?:li|is)",
+    },
+})
+
+
+class CheveretoImageExtractor(CheveretoExtractor):
+    """Extractor for chevereto Images"""
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
+    example = "https://jpg2.su/img/TITLE.ID"
+
+    def items(self):
+        url = self.root + self.path
+        extr = text.extract_from(self.request(url).text)
+
+        image = {
+            "id"   : self.path.rpartition(".")[2],
+            "url"  : extr('<meta property="og:image" content="', '"'),
+            "album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
+            "user" : extr('username: "', '"'),
+        }
+
+        text.nameext_from_url(image["url"], image)
+        yield Message.Directory, image
+        yield Message.Url, image["url"], image
+
+
+class CheveretoAlbumExtractor(CheveretoExtractor):
+    """Extractor for chevereto Albums"""
+    subcategory = "album"
+    pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
+    example = "https://jpg2.su/album/TITLE.ID"
+
+    def items(self):
+        url = self.root + self.path
+        data = {"_extractor": CheveretoImageExtractor}
+
+        if self.path.endswith("/sub"):
+            albums = self._pagination(url)
+        else:
+            albums = (url,)
+
+        for album in albums:
+            for image in self._pagination(album):
+                yield Message.Queue, image, data
+
+
+class CheveretoUserExtractor(CheveretoExtractor):
+    """Extractor for chevereto Users"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)"
+    example = "https://jpg2.su/USER"
+
+    def items(self):
+        url = self.root + self.path
+
+        if self.path.endswith("/albums"):
+            data = {"_extractor": CheveretoAlbumExtractor}
+        else:
+            data = {"_extractor": CheveretoImageExtractor}
+
+        for url in self._pagination(url):
+            yield Message.Queue, url, data
--- a/gallery_dl/extractor/jpgfish.py
+++ b/gallery_dl/extractor/jpgfish.py
@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://jpg1.su/"""
-
-from .common import Extractor, Message
-from .. import text
-
-BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)"
-
-
-class JpgfishExtractor(Extractor):
-    """Base class for jpgfish extractors"""
-    category = "jpgfish"
-    root = "https://jpg1.su"
-    directory_fmt = ("{category}", "{user}", "{album}",)
-    archive_fmt = "{id}"
-
-    def _pagination(self, url):
-        while url:
-            page = self.request(url).text
-
-            for item in text.extract_iter(
-                    page, '<div class="list-item-image ', 'image-container'):
-                yield text.extract(item, '<a href="', '"')[0]
-
-            url = text.extract(
-                page, '<a data-pagination="next" href="', '" ><')[0]
-
-
-class JpgfishImageExtractor(JpgfishExtractor):
-    """Extractor for jpgfish Images"""
-    subcategory = "image"
-    pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
-    example = "https://jpg1.su/img/TITLE.ID"
-
-    def __init__(self, match):
-        JpgfishExtractor.__init__(self, match)
-        self.path, self.image_id = match.groups()
-
-    def items(self):
-        url = "{}/img/{}".format(self.root, self.path)
-        extr = text.extract_from(self.request(url).text)
-
-        image = {
-            "id"   : self.image_id,
-            "url"  : extr('<meta property="og:image" content="', '"'),
-            "album": text.extract(extr(
-                "Added to <a", "/a>"), ">", "<")[0] or "",
-            "user" : extr('username: "', '"'),
-        }
-
-        text.nameext_from_url(image["url"], image)
-        yield Message.Directory, image
-        yield Message.Url, image["url"], image
-
-
-class JpgfishAlbumExtractor(JpgfishExtractor):
-    """Extractor for jpgfish Albums"""
-    subcategory = "album"
-    pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
-    example = "https://jpg1.su/album/TITLE.ID"
-
-    def __init__(self, match):
-        JpgfishExtractor.__init__(self, match)
-        self.album, self.sub_albums = match.groups()
-
-    def items(self):
-        url = "{}/a/{}".format(self.root, self.album)
-        data = {"_extractor": JpgfishImageExtractor}
-
-        if self.sub_albums:
-            albums = self._pagination(url + "/sub")
-        else:
-            albums = (url,)
-
-        for album in albums:
-            for image in self._pagination(album):
-                yield Message.Queue, image, data
-
-
-class JpgfishUserExtractor(JpgfishExtractor):
-    """Extractor for jpgfish Users"""
-    subcategory = "user"
-    pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
-    example = "https://jpg1.su/USER"
-
-    def __init__(self, match):
-        JpgfishExtractor.__init__(self, match)
-        self.user, self.albums = match.groups()
-
-    def items(self):
-        url = "{}/{}".format(self.root, self.user)
-
-        if self.albums:
-            url += "/albums"
-            data = {"_extractor": JpgfishAlbumExtractor}
-        else:
-            data = {"_extractor": JpgfishImageExtractor}
-
-        for url in self._pagination(url):
-            yield Message.Queue, url, data
--- a/test/results/jpgfish.py
+++ b/test/results/jpgfish.py
@@ -4,15 +4,15 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-from gallery_dl.extractor import jpgfish
+from gallery_dl.extractor import chevereto


 __tests__ = (
 {
-    "#url"     : "https://jpg1.su/img/funnymeme.LecXGS",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
-    "#pattern"     : r"https://simp3\.jpg\.church/images/funnymeme\.jpg",
+    "#url"     : "https://jpg2.su/img/funnymeme.LecXGS",
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
+    "#urls"        : "https://simp3.jpg.church/images/funnymeme.jpg",
    "#sha1_content": "098e5e9b17ad634358426e0ffd1c93871474d13c",

    "album"    : "",
@@ -25,125 +25,131 @@ __tests__ = (

 {
    "#url"     : "https://jpg.church/img/auCruA",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
    "#pattern" : r"https://simp2\.jpg\.church/hannahowo_00457\.jpg",

    "album": "401-500",
 },

+{
+    "#url"     : "https://jpg1.su/img/funnymeme.LecXGS",
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
+},
+
 {
    "#url"     : "https://jpeg.pet/img/funnymeme.LecXGS",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
 },

 {
    "#url"     : "https://jpg.pet/img/funnymeme.LecXGS",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
 },

 {
    "#url"     : "https://jpg.fishing/img/funnymeme.LecXGS",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
 },

 {
    "#url"     : "https://jpg.fish/img/funnymeme.LecXGS",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
 },

 {
    "#url"     : "https://jpg.church/img/funnymeme.LecXGS",
-    "#category": ("", "jpgfish", "image"),
-    "#class"   : jpgfish.JpgfishImageExtractor,
+    "#category": ("chevereto", "jpgfish", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
 },

 {
    "#url"     : "https://jpg1.su/album/CDilP/?sort=date_desc&page=1",
-    "#category": ("", "jpgfish", "album"),
-    "#class"   : jpgfish.JpgfishAlbumExtractor,
+    "#category": ("chevereto", "jpgfish", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
    "#count"   : 2,
 },

 {
    "#url"     : "https://jpg.fishing/a/gunggingnsk.N9OOI",
-    "#category": ("", "jpgfish", "album"),
-    "#class"   : jpgfish.JpgfishAlbumExtractor,
+    "#category": ("chevereto", "jpgfish", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
    "#count"   : 114,
 },

 {
    "#url"     : "https://jpg.fish/a/101-200.aNJ6A/",
-    "#category": ("", "jpgfish", "album"),
-    "#class"   : jpgfish.JpgfishAlbumExtractor,
+    "#category": ("chevereto", "jpgfish", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
    "#count"   : 100,
 },

 {
    "#url"     : "https://jpg.church/a/hannahowo.aNTdH/sub",
-    "#category": ("", "jpgfish", "album"),
-    "#class"   : jpgfish.JpgfishAlbumExtractor,
+    "#category": ("chevereto", "jpgfish", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
    "#count"   : 606,
 },

 {
    "#url"     : "https://jpeg.pet/album/CDilP/?sort=date_desc&page=1",
-    "#category": ("", "jpgfish", "album"),
-    "#class"   : jpgfish.JpgfishAlbumExtractor,
+    "#category": ("chevereto", "jpgfish", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
 },

 {
    "#url"     : "https://jpg.pet/album/CDilP/?sort=date_desc&page=1",
-    "#category": ("", "jpgfish", "album"),
-    "#class"   : jpgfish.JpgfishAlbumExtractor,
+    "#category": ("chevereto", "jpgfish", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
 },

 {
    "#url"     : "https://jpg1.su/exearco",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
    "#count"   : 3,
 },

 {
    "#url"     : "https://jpg.church/exearco/albums",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
    "#count"   : 1,
 },

 {
    "#url"     : "https://jpeg.pet/exearco",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
 },

 {
    "#url"     : "https://jpg.pet/exearco",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
 },

 {
    "#url"     : "https://jpg.fishing/exearco",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
 },

 {
    "#url"     : "https://jpg.fish/exearco",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
 },

 {
    "#url"     : "https://jpg.church/exearco",
-    "#category": ("", "jpgfish", "user"),
-    "#class"   : jpgfish.JpgfishUserExtractor,
+    "#category": ("chevereto", "jpgfish", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
 },

 )
--- a/test/results/pixl.py
+++ b/test/results/pixl.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import chevereto
+
+
+__tests__ = (
+{
+    "#url"     : "https://pixl.li/image/894x1023-1c8d6dd3b1b0cd4b0d286b229157a7de.z3DwHB",
+    "#category": ("chevereto", "pixl", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
+    "#urls"        : "https://i.pixl.li/894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de.jpg",
+    "#sha1_content": "3279b86d0ac42348c703770c4781ecdc300fc13c",
+
+    "album": "",
+    "extension": "jpg",
+    "filename": "894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de",
+    "id": "z3DwHB",
+    "url": "https://i.pixl.li/894x1023_1c8d6dd3b1b0cd4b0d286b229157a7de.jpg",
+    "user": "matafaka1",
+},
+
+{
+    "#url"     : "https://pixl.is/image/894x1023-1c8d6dd3b1b0cd4b0d286b229157a7de.z3DwHB",
+    "#category": ("chevereto", "pixl", "image"),
+    "#class"   : chevereto.CheveretoImageExtractor,
+},
+
+{
+    "#url"     : "https://pixl.li/album/estelasaubi.D0bJf",
+    "#category": ("chevereto", "pixl", "album"),
+    "#class"   : chevereto.CheveretoAlbumExtractor,
+    "#pattern" : chevereto.CheveretoImageExtractor.pattern,
+    "#count"   : 173,
+},
+
+{
+    "#url"     : "https://pixl.li/mjstik",
+    "#category": ("chevereto", "pixl", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
+    "#pattern" : chevereto.CheveretoImageExtractor.pattern,
+    "#range"   : "1-20",
+    "#count"   : 20,
+},
+
+{
+    "#url"     : "https://pixl.li/mjstik/albums",
+    "#category": ("chevereto", "pixl", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
+    "#pattern" : chevereto.CheveretoAlbumExtractor.pattern,
+    "#count"   : 285,
+},
+
+{
+    "#url"     : "https://pixl.is/renford/albums",
+    "#category": ("chevereto", "pixl", "user"),
+    "#class"   : chevereto.CheveretoUserExtractor,
+},
+
+)