[saint] add 'album' and 'media' extractors (#4405, #6324)

2024-11-24 19:52:32 +01:00 · 2024-10-27 22:22:43 +01:00 · 2024-10-27 22:22:43 +01:00 · 10c076e7f2
commit 10c076e7f2
parent 061b27f329
5 changed files with 197 additions and 1 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -787,6 +787,12 @@ Consider all listed sites to potentially be NSFW.
    <td>Posts, Tag Searches</td>
    <td></td>
 </tr>
+<tr>
+    <td>Saint</td>
+    <td>https://saint2.su/</td>
+    <td>Albums, Media Files</td>
+    <td></td>
+</tr>
 <tr>
    <td>Sankaku Channel</td>
    <td>https://sankaku.app/</td>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -139,6 +139,7 @@ modules = [
    "reddit",
    "redgifs",
    "rule34us",
+    "saint",
    "sankaku",
    "sankakucomplex",
    "scrolller",
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -46,12 +46,17 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
        for data["num"], file in enumerate(files, 1):
            url = file["file"]
            file.update(data)
+
+            if "extension" not in file:
                text.nameext_from_url(url, file)

            if "name" in file:
                name = file["name"]
                file["name"] = name.rpartition(".")[0] or name
                file["id"] = file["filename"].rpartition("-")[2]
+            elif "id" in file:
+                file["name"] = file["filename"]
+                file["filename"] = "{}-{}".format(file["name"], file["id"])
            else:
                file["name"], sep, file["id"] = \
                    file["filename"].rpartition("-")
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://saint2.su/"""
+
+from .lolisafe import LolisafeAlbumExtractor
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|to)"  # scheme is optional
+
+
+class SaintAlbumExtractor(LolisafeAlbumExtractor):
+    """Extractor for saint albums"""
+    category = "saint"
+    root = "https://saint2.su"
+    pattern = BASE_PATTERN + r"/a/([^/?#]+)"  # captures segment after /a/
+    example = "https://saint2.su/a/ID"
+
+    def fetch_album(self, album_id):  # -> (file dicts, album metadata)
+        # request the album page and scrape metadata and files from its HTML
+        response = self.request(self.root + "/a/" + album_id)
+        extr = text.extract_from(response.text)  # assumed sequential
+
+        title = extr("<title>", "<")
+        descr = extr('name="description" content="', '"')
+        files = []
+
+        while True:  # one pass per "/thumbs/" match
+            id2 = extr("/thumbs/", "-")
+            if not id2:  # nothing extracted: stop collecting
+                break
+            files.append({
+                "id2"  : id2,
+                "date" : text.parse_timestamp(extr("", ".")),
+                "id"   : extr("/embed/", '"'),
+                "size" : text.parse_int(extr('data="', '"')),
+                "file" : text.unescape(extr(
+                    "onclick=\"play(", ")").strip("\"'")),  # drop quotes
+                "id_dl": extr("/d/", ")").rstrip("\"'"),
+            })
+
+        return files, {
+            "album_id"     : album_id,
+            "album_name"   : text.unescape(title.rpartition(" - ")[0]),
+            "album_size"   : sum(file["size"] for file in files),
+            "description"  : text.unescape(descr),
+            "count"        : len(files),
+            "_http_headers": {"Referer": response.url}
+        }
+
+
+class SaintMediaExtractor(SaintAlbumExtractor):
+    """Extractor for saint media links"""
+    subcategory = "media"
+    directory_fmt = ("{category}",)
+    pattern = BASE_PATTERN + r"(/(embe)?d/([^/?#]+))"  # /embed/ID or /d/ID
+    example = "https://saint2.su/embed/ID"
+
+    def fetch_album(self, album_id):  # -> (files, metadata)
+        try:
+            path, embed, _ = self.groups  # assumed: the pattern's groups
+
+            url = self.root + path
+            response = self.request(url)
+            extr = text.extract_from(response.text)
+
+            if embed:  # /embed/
+                file = {
+                    "id"   : album_id,
+                    "id2"  : extr("/thumbs/", "-"),
+                    "date" : text.parse_timestamp(extr("", ".")),
+                    "file" : text.unescape(extr('<source src="', '"')),
+                    "id_dl": extr("/d/", "'"),
+                }
+
+            else:  # /d/
+                file = {
+                    "file"     : text.unescape(extr('<a href="', '"')),
+                    "id_dl"    : album_id,
+                    "name"     : album_id,
+                    "filename" : album_id,
+                    "extension": "mp4",
+                }
+
+            file["_http_headers"] = {"Referer": response.url}
+        except Exception as exc:  # NOTE(review): broad, best-effort catch
+            self.log.error("%s: %s", exc.__class__.__name__, exc)
+            return (), {}  # no files, no metadata
+
+        return (file,), {  # single file; album fields are placeholders
+            "album_id"   : "",
+            "album_name" : "",
+            "album_size" : -1,
+            "description": "",
+            "count"      : 1,
+        }
--- a/test/results/saint.py
+++ b/test/results/saint.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import saint
+
+
+__tests__ = (
+{  # album URL: two files expected
+    "#url"  : "https://saint2.su/a/2c5iuWHTumH",
+    "#class": saint.SaintAlbumExtractor,
+    "#urls" : (
+        "https://cold1.saint2.cr/videos/3b1ccebf3576f8d5aac3ee0e5a12da95.mp4",
+        "https://cold1.saint2.cr/videos/3b125e3fb4b98693f17d85cb53590215.mp4",
+    ),
+
+    "album_id"   : "2c5iuWHTumH",
+    "album_name" : "animations",
+    "album_size" : 37083862,
+    "count"      : 2,
+    "date"       : "type:datetime",
+    "description": "Descriptions can contain only alphanumeric ASCII characters",
+    "extension"  : "mp4",
+    "file"       : r"re:https://...",
+    "filename"   : {"3b1ccebf3576f8d5aac3ee0e5a12da95-6lC7mKrJst8",
+                    "3b125e3fb4b98693f17d85cb53590215-ze10Ohbpoy5"},
+    "id"         : {"6lC7mKrJst8",
+                    "ze10Ohbpoy5"},
+    "id2"        : {"6712834015d67",
+                    "671284a627e0e"},
+    "id_dl"      : {"M2IxY2NlYmYzNTc2ZjhkNWFhYzNlZTBlNWExMmRhOTUubXA0",
+                    "M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0"},
+    "name"       : {"3b1ccebf3576f8d5aac3ee0e5a12da95",
+                    "3b125e3fb4b98693f17d85cb53590215"},
+    "num"        : {1, 2},
+},
+
+{  # single file via an /embed/ URL
+    "#url"  : "https://saint2.su/embed/6lC7mKrJst8",
+    "#class": saint.SaintMediaExtractor,
+    "#urls"        : "https://cold1.saint2.cr/videos/3b1ccebf3576f8d5aac3ee0e5a12da95.mp4",
+    "#sha1_content": "39037a029b3fe96f838b4545316caaa545c84075",
+
+    "count"    : 1,
+    "date"     : "dt:2024-10-18 15:48:16",
+    "extension": "mp4",
+    "file"     : "https://cold1.saint2.cr/videos/3b1ccebf3576f8d5aac3ee0e5a12da95.mp4",
+    "filename" : "3b1ccebf3576f8d5aac3ee0e5a12da95-6lC7mKrJst8",
+    "id"       : "6lC7mKrJst8",
+    "id2"      : "6712834015d67",
+    "id_dl"    : "M2IxY2NlYmYzNTc2ZjhkNWFhYzNlZTBlNWExMmRhOTUubXA0",
+    "name"     : "3b1ccebf3576f8d5aac3ee0e5a12da95",
+    "num"      : 1,
+},
+
+{  # single file via a /d/ URL
+    "#url"  : "https://saint2.su/d/M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+    "#class": saint.SaintMediaExtractor,
+    "#urls" : "https://cold1.saint2.cr/api/download.php?file=M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+
+    "count"    : 1,
+    "extension": "mp4",
+    "file"     : "https://cold1.saint2.cr/api/download.php?file=M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+    "filename" : "M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+    "id"       : "M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+    "id_dl"    : "M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+    "name"     : "M2IxMjVlM2ZiNGI5ODY5M2YxN2Q4NWNiNTM1OTAyMTUubXA0",
+    "num"      : 1,
+},
+
+{  # saint2.pk domain
+    "#url"  : "https://saint2.pk/embed/6lC7mKrJst8",
+    "#class": saint.SaintMediaExtractor,
+},
+
+{  # saint.to domain
+    "#url"  : "https://saint.to/embed/6lC7mKrJst8",
+    "#class": saint.SaintMediaExtractor,
+},
+
+)