2021-04-02 21:01:31 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2021-05-28 17:52:30 +02:00
|
|
|
"""Extractors for https://manganato.com/"""
|
2021-04-02 21:01:31 +02:00
|
|
|
|
|
|
|
from .common import ChapterExtractor, MangaExtractor
|
|
|
|
from .. import text
|
|
|
|
import re
|
|
|
|
|
2024-01-01 22:05:21 +01:00
|
|
|
# Matches every domain spelling this site family has used
# (manganato/manganelo, with optional chap/read/www./m. prefixes);
# group 1 captures the bare domain for building the request URL.
BASE_PATTERN = (
    r"(?:https?://)?"
    r"((?:chap|read|www\.|m\.)?"
    r"mangan(?:at|el)o\.(?:to|com))"
)
|
2021-04-02 21:01:31 +02:00
|
|
|
|
|
|
|
|
2023-02-16 22:31:18 +01:00
|
|
|
class ManganeloBase():
    """Shared logic for manganelo/manganato chapter and manga extractors"""
    category = "manganelo"
    root = "https://chapmanganato.com"
    # lazily compiled chapter-info matcher; cached on this class so it is
    # shared across all subclasses and instances
    _match_chapter = None

    def __init__(self, match):
        # groups: (domain, path) as captured by BASE_PATTERN + subclass pattern
        super().__init__(match, "https://{}{}".format(*match.groups()))

    def _init(self):
        # compile the chapter-info regex on first use only
        if ManganeloBase._match_chapter is None:
            pattern = (
                r"(?:[Vv]ol\.?\s*(\d+)\s?)?"
                r"[Cc]hapter\s*(\d+)([^:]*)"
                r"(?::\s*(.+))?"
            )
            ManganeloBase._match_chapter = re.compile(pattern).match

    def _parse_chapter(self, info, manga, author, date=None):
        """Turn a chapter 'info' string into a chapter metadata dict"""
        match = self._match_chapter(info)
        if match is None:
            # unparseable chapter string: keep it verbatim as the title
            volume = chapter = minor = ""
            title = info
        else:
            volume, chapter, minor, title = match.groups()

        return {
            "manga"        : manga,
            "author"       : author,
            "date"         : date,
            "title"        : text.unescape(title) if title else "",
            "volume"       : text.parse_int(volume),
            "chapter"      : text.parse_int(chapter),
            "chapter_minor": minor,
            "lang"         : "en",
            "language"     : "English",
        }
|
|
|
|
|
|
|
|
|
|
|
|
class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
    """Extractor for manga chapters from manganelo.com"""
    pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)"
    example = "https://chapmanganato.com/manga-ID/chapter-01"

    def metadata(self, page):
        """Extract manga title, chapter info, and author from a chapter page"""
        extr = text.extract_from(page)
        extr('class="a-h"', ">")  # advance past the breadcrumb marker
        manga = text.unescape(extr('title="', '"'))
        info = extr('title="', '"')
        author = text.unescape(extr("- Author(s) : ", "</p>"))
        return self._parse_chapter(info, manga, author)

    def images(self, page):
        """Collect (image URL, metadata) pairs for all pages of a chapter"""
        page = text.extr(
            page, 'class="container-chapter-reader', 'class="container')

        # primary reader markup; drop the site's 'go home' placeholder image
        urls = [
            (url, None)
            for url in text.extract_iter(page, '<img src="', '"')
            if not url.endswith("/gohome.png")
        ]
        if urls:
            return urls

        # fallback for the alternate reader markup
        return [
            (url, None)
            for url in text.extract_iter(
                page, '<img class="reader-content" src="', '"')
        ]
|
|
|
|
|
|
|
|
|
2023-02-16 22:31:18 +01:00
|
|
|
class ManganeloMangaExtractor(ManganeloBase, MangaExtractor):
    """Extractor for manga from manganelo.com"""
    chapterclass = ManganeloChapterExtractor
    pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$"
    example = "https://manganato.com/manga-ID"

    def chapters(self, page):
        """Return a list of (chapter URL, chapter metadata) tuples"""
        extr = text.extract_from(page)
        manga = text.unescape(extr("<h1>", "<"))
        author = text.remove_html(extr("</i>Author(s) :</td>", "</tr>"))

        # seek to the start of the chapter listing before looping
        extr('class="row-content-chapter', '')
        results = []
        while True:
            url = extr('class="chapter-name text-nowrap" href="', '"')
            if not url:
                # no more chapter entries on the page
                return results
            info = extr(">", "<")
            date = extr('class="chapter-time text-nowrap" title="', '"')
            results.append(
                (url, self._parse_chapter(info, manga, author, date)))
|