[comicfury] add support

2024-11-25 04:02:32 +01:00 · 2024-01-17 19:52:01 +11:00 · 2024-01-17 19:52:01 +11:00 · cd7cb8c505
commit cd7cb8c505
parent bca9a1a1e5
4 changed files with 256 additions and 0 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -181,6 +181,12 @@ Consider all listed sites to potentially be NSFW.
    <td>Tag Searches</td>
    <td></td>
 </tr>
+<tr>
+    <td>Comicfury</td>
+    <td>https://comicfury.com</td>
+    <td>Comic Issues, Comics</td>
+    <td></td>
+</tr>
 <tr>
    <td>Coomer</td>
    <td>https://coomer.su/</td>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -39,6 +39,7 @@ modules = [
    "cien",
    "civitai",
    "cohost",
+    "comicfury",
    "comicvine",
    "cyberdrop",
    "danbooru",
--- a/gallery_dl/extractor/comicfury.py
+++ b/gallery_dl/extractor/comicfury.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://comicfury.com"""
+
+import re
+import itertools
+from .common import Extractor, Message
+from .. import text
+
+
+# Regex fragment matching "<subdomain>.<domain>" for the alternative domains
+# handled by the extractors in this module. The subdomain label is the only
+# capture group; the extractors below use it as the comic's name.
+CF_DOMAINS = (
+    r"([\w-]+)\.(?:thecomicseries\.com|the-comic\.org"
+    r"|thecomicstrip\.org|webcomic\.ws|cfw\.me)"
+)
+
+
+class ComicfuryExtractor(Extractor):
+    """Base class for ComicFury extractors"""
+    category = "comicfury"
+    directory_fmt = ("{category}", "{comic}")
+    filename_fmt = "{category}_{comic}_{id}_{num:>02}.{extension}"
+    archive_fmt = "{filename}"
+    root = "https://comicfury.com"
+    cookies_domain = "comicfury.com"
+
+    def _init(self):
+        # Bound 'search' of a pattern whose group 1 is the inner HTML of
+        # an 'is--image-segments' container
+        self._search_segments = re.compile(
+            (r'\n *<div class="is--image-segments">\n'
+             r'([\s\S]+?)\n *</div>\n')).search
+
+    def request(self, url, **kwargs):
+        """Request 'url' and confirm the content warning if one is shown"""
+        resp = Extractor.request(self, url, **kwargs)
+        if '<div class="nhead">Content Warning</div>' in resp.text:
+            # Repeat the request as a POST that submits the "proceed"
+            # field together with the 'token' cookie value
+            token = self.session.cookies.get(
+                "token", domain=self.cookies_domain)
+            resp = Extractor.request(self, url, method="POST", data={
+                "proceed": "View Webcomic",
+                "token": token,
+            }, **kwargs)
+        return resp
+
+    def _parse_page(self, page):
+        """Yield Directory and Url messages for each comic page in 'page'
+
+        For every '<div class="is--comic-page" ...>' found in 'page', one
+        Message.Directory with that page's metadata is yielded, followed
+        by one Message.Url per image of the page.
+
+        Missing optional markers do not abort extraction: absent numeric
+        fields become 0, absent text fields become "", and a comic page
+        without an image-segments container gets a 'count' of 0.
+        """
+        comic_name, pos = text.extract(
+            page, '<h2 class="webcomic-title-content-inner">', '</h2>')
+        relative_id, pos = text.extract(
+            page, 'Comic #', ':', pos)
+        comic, pos = text.extract(
+            page, '<a href="/comicprofile.php?url=', '"', pos)
+
+        # A marker that is not found leaves the value empty, and a bare
+        # int() would raise on it; parse_int() falls back to 0 instead
+        relative_id = text.parse_int(relative_id)
+
+        while True:
+            page_id, pos = text.extract(
+                page, '<div class="is--comic-page" id="comic-', '"', pos)
+            if not page_id:
+                break
+            chapter_id, pos = text.extract(
+                page, ' data-chapter-id="', '"', pos)
+            chapter_name, pos = text.extract(
+                page, ' data-chapter-name="', '"', pos)
+            # Skip past the title's 'style' attribute; only the new
+            # position is needed
+            pos = text.extract(
+                page, '<div class="is--title" style="', '"', pos)[1]
+            title, pos = text.extract(page, '>', '</div>', pos)
+
+            segments = self._search_segments(page, pos)
+            if segments is None:
+                # No image container after this title: report the page
+                # without images instead of failing on 'None.end()'
+                urls = []
+            else:
+                pos = segments.end(0)
+                urls = list(text.extract_iter(
+                    segments.group(1), '<img src="', '"'))
+
+            data = {
+                "comic_name": text.unescape(comic_name or ""),
+                "comic": comic,
+                "relative_id": relative_id,
+                # 'id' is part of 'filename_fmt', so it stays a strict
+                # conversion rather than silently defaulting to 0
+                "id": int(page_id),
+                "chapter_id": text.parse_int(chapter_id),
+                "chapter_name": text.unescape(chapter_name or ""),
+                "title": text.unescape(title or ""),
+                "count": len(urls)
+            }
+            yield Message.Directory, data
+            for data["num"], url in enumerate(urls, 1):
+                url = text.unescape(url)
+                yield Message.Url, url, text.nameext_from_url(url, data)
+
+            # Further comic pages in the same document are numbered
+            # consecutively from the first one
+            relative_id += 1
+
+
+class ComicfuryIssueExtractor(ComicfuryExtractor):
+    """Extractor for a single issue URL"""
+    subcategory = "issue"
+    pattern = (r"(?:https?://)?(?:comicfury\.com/read/([\w-]+)(?:/comics?/"
+               r"(first|last|\d+)?)?|" + CF_DOMAINS + r"/comics/"
+               r"(first|1|pl/\d+)?)(?:[?#].*)?$")
+    example = "https://comicfury.com/read/URL/comics/1234"
+
+    def __init__(self, match):
+        ComicfuryExtractor.__init__(self, match)
+        # Groups 1 and 2: comic and issue of a 'comicfury.com/read/' URL
+        # Groups 3 and 4: subdomain and issue of a CF_DOMAINS URL
+        self.comic = match.group(1) or match.group(3)
+        if match.group(1) is not None:
+            self.id = match.group(2) or ""
+        else:
+            issue = match.group(4)
+            if issue in ("first", "1"):
+                self.id = "first"
+            elif not issue:
+                self.id = "last"
+            else:
+                # Remaining alternative is "pl/<digits>": drop the "pl/"
+                self.id = issue[3:]
+
+    def items(self):
+        """Yield the messages of the first comic page found at the URL"""
+        url = self.root + "/read/" + self.comic + "/comics/" + self.id
+        page = self.request(url).text
+        messages = self._parse_page(page)
+
+        # A bare next() on an exhausted iterator raises StopIteration,
+        # which becomes a RuntimeError inside a generator (PEP 479).
+        # Finish quietly when the page contains no comic at all.
+        first = next(messages, None)
+        if first is None:
+            return
+        yield first
+        # The Directory message is followed by 'count' Url messages;
+        # anything after them is not consumed
+        yield from itertools.islice(messages, first[1]["count"])
+
+
+class ComicfuryComicExtractor(ComicfuryExtractor):
+    """Extractor for every issue of a comic"""
+    subcategory = "comic"
+    pattern = (r"(?:https?://)?(?:comicfury\.com/comicprofile\.php"
+               r"\?url=([\w-]+)|" + CF_DOMAINS + r")/?(?:[?#].*)?$")
+    example = "https://comicfury.com/comicprofile.php?url=URL"
+
+    def __init__(self, match):
+        ComicfuryExtractor.__init__(self, match)
+        # Group 1: 'url' parameter of a profile URL
+        # Group 2: subdomain of a CF_DOMAINS URL
+        profile_name, subdomain = match.group(1, 2)
+        self.comic = profile_name or subdomain
+
+    def items(self):
+        """Start at the first page and follow 'next page' links to the end"""
+        url = "{}/read/{}/comics/first".format(self.root, self.comic)
+        while url:
+            page = self.request(url).text
+            yield from self._parse_page(page)
+
+            container = text.extr(
+                page, '<div class="final-next-page-link-container">', '</div>')
+            href = text.extr(
+                container, '<a href="', '" class="final-next-page-link">')
+            # No link means the last page was reached; this ends the loop
+            url = text.urljoin(url, text.unescape(href)) if href else None
--- a/test/results/comicfury.py
+++ b/test/results/comicfury.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import comicfury
+
+
+# Test cases for the comicfury extractors. Each entry gives the input URL
+# ("#url"), the expected extractor ("#category", "#class") and, where
+# present, expected result counts, URLs/hashes and metadata values.
+__tests__ = (
+# Issue on an alternative domain, addressed as 'pl/<id>'
+{
+    "#url"     : "https://rain.thecomicseries.com/comics/pl/73003",
+    "#category": ("", "comicfury", "issue"),
+    "#class"   : comicfury.ComicfuryIssueExtractor,
+    "#count"   : 1,
+    "#urls"    : "https://img.comicfury.com/comics/c8f813e19a0aae0f2a0b57a6b36ceec52058036413.png",
+
+    "comic_name"  : "Rain",
+    "comic"       : "rain",
+    "relative_id" : 6,
+    "id"          : 73003,
+    "chapter_id"  : 2770,
+    "chapter_name": "Ch 1: The New Girl",
+    "title"       : "Chapter 1 - The New Girl",
+},
+
+# Issue on an alternative domain, addressed as 'first'
+{
+    "#url"     : "https://grinders.the-comic.org/comics/first",
+    "#category": ("", "comicfury", "issue"),
+    "#class"   : comicfury.ComicfuryIssueExtractor,
+    "#count"   : 1,
+    "#urls"    : "https://img.comicfury.com/comics/184/43571a1579840219f1635377961.png",
+
+    "comic_name"  : "Grinder$",
+    "comic"       : "grinders",
+    "relative_id" : 1,
+    "id"          : 1137093,
+    "chapter_id"  : 48527,
+    "chapter_name": "Foam",
+    "title"       : "Teaser",
+},
+
+# Pattern-only check: alternative domain with '/comics/1'
+{
+    "#url"     : "https://belovedchainscomic.thecomicstrip.org/comics/1",
+    "#category": ("", "comicfury", "issue"),
+    "#class"   : comicfury.ComicfuryIssueExtractor,
+},
+
+# Pattern-only check: alternative domain with an empty issue part
+{
+    "#url"     : "https://belovedchainscomic.webcomic.ws/comics/",
+    "#category": ("", "comicfury", "issue"),
+    "#class"   : comicfury.ComicfuryIssueExtractor,
+},
+
+# comicfury.com reader URL using the singular '/comic/' path and 'last'
+{
+    "#url"     : "https://comicfury.com/read/MKsJekyllAndHyde/comic/last",
+    "#category": ("", "comicfury", "issue"),
+    "#class"   : comicfury.ComicfuryIssueExtractor,
+    "#count"   : 1,
+    "#urls"    : "https://img.comicfury.com/comics/222/37111a1634996413b60163f1077624721.png",
+
+    "comic_name"  : "MK's The Strange Case of Dr. Jekyll and Mr. Hyde",
+    "comic"       : "MKsJekyllAndHyde",
+    "relative_id" : 622,
+    "id"          : 1493321,
+    "chapter_id"  : 57040,
+    "chapter_name": "Epilogue 3",
+    "title"       : "THE END",
+},
+
+# comicfury.com reader URL without any '/comics/...' part
+{
+    "#url"     : "https://comicfury.com/read/rain-tradfr",
+    "#category": ("", "comicfury", "issue"),
+    "#class"   : comicfury.ComicfuryIssueExtractor,
+    "#count"   : 1,
+    "#urls"    : "https://img.comicfury.com/comics/218/49338a1624179795b80143f379314885.jpg",
+
+    "comic_name"  : "Rain, la traduction française",
+    "comic"       : "rain-tradfr",
+    "relative_id" : 1,
+    "id"          : 1381699,
+    "chapter_id"  : 56171,
+    "chapter_name": "Hors Chapitre",
+    "title"       : "RAIN",
+},
+
+# Whole comic via its profile URL, limited to the first six results
+{
+    "#url"     : "https://comicfury.com/comicprofile.php?url=lanternsofarcadia",
+    "#category": ("", "comicfury", "comic"),
+    "#class"   : comicfury.ComicfuryComicExtractor,
+    "#range"   : "1-6",
+    "#sha1_url"    : "d4080dcb41f5c019e1ceb450a624041208ccdcb8",
+    "#sha1_content": "0c1937e4d177ce55afbfe30ab9376700c6cf619f",
+},
+
+# Pattern-only check: whole comic via a bare alternative-domain URL
+{
+    "#url"     : "https://bloomer-layout.cfw.me",
+    "#category": ("", "comicfury", "comic"),
+    "#class"   : comicfury.ComicfuryComicExtractor,
+},
+
+)