gallery-dl/gallery_dl/extractor/batoto.py

# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bato.to/"""

from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import re

# URL prefix shared by all batoto extractors: an optional scheme followed by
# one of the accepted domains. Each fragment after the first starts with the
# '|' that separates it from the previous alternative.
BASE_PATTERN = (
    r"(?:https?://)?(?:"
    r"(?:ba|d|h|m|w)to\.to"
    r"|(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)"
    r"|comiko\.(?:net|org)"
    r"|bat(?:otoo|o?two)\.com"
    r")"
)


class BatotoBase:
    """Common attributes and request handling for batoto extractors"""
    category = "batoto"
    root = "https://bato.to"

    def request(self, url, **kwargs):
        """Delegate to Extractor.request() with 'encoding' set to UTF-8

        An 'encoding' keyword passed by the caller is overwritten;
        all other keyword arguments are forwarded unchanged.
        """
        kwargs.update(encoding="utf-8")
        return Extractor.request(self, url, **kwargs)


class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
    """Extractor for a single bato.to manga chapter"""
    pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
    example = "https://bato.to/title/12345-MANGA/54321"

    def __init__(self, match):
        """Take site root and chapter ID from the matched URL"""
        root = text.root_from_url(match.group(0))
        chapter_id = match.group(1)

        self.root = root
        self.chapter_id = chapter_id
        # the path segment between 'title' and the chapter ID is always '0'
        ChapterExtractor.__init__(
            self, match, "%s/title/0/%s" % (root, chapter_id))

    def metadata(self, page):
        """Build the chapter metadata dict from the chapter page's HTML

        'extr' reads 'page' sequentially, so the order of the calls
        below must follow the order of the markers in the page.
        """
        extr = text.extract_from(page)

        # expected <title> layout: "<manga> - <info> - <rest>";
        # any other number of " - " separated parts makes the
        # unpacking fail and defers to the 'link-hover' fallback
        try:
            manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
        except ValueError:
            manga = info = None

        canonical = extr('rel="canonical" href="', '"')
        manga_id = text.extr(canonical, "/title/", "/")

        if not manga:
            manga = extr('link-hover">', "<")
            info = text.remove_html(extr('link-hover">', "</"))
        info = text.unescape(info)

        # optional "Volume N" / "Season N" / "SN" prefix, followed by
        # "Chapter N" or "Episode N" and a suffix of word chars and dots
        found = re.match(
            r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
            r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info)
        volume, chapter, minor = found.groups() if found else (0, 0, "")

        # the remaining page-dependent values, in page order
        chapter_url = extr(self.chapter_id + "-ch_", '"')
        selected = text.remove_html(extr("selected>", "</option"))
        # last three characters are cut off before parsing
        # (presumably a millisecond timestamp - TODO confirm)
        timestamp = extr(' time="', '"')[:-3]

        return {
            "manga"         : text.unescape(manga),
            "manga_id"      : text.parse_int(manga_id),
            "chapter_url"   : chapter_url,
            "title"         : text.unescape(selected.partition(" : ")[2]),
            "volume"        : text.parse_int(volume),
            "chapter"       : text.parse_int(chapter),
            "chapter_minor" : minor,
            "chapter_string": info,
            "chapter_id"    : text.parse_int(self.chapter_id),
            "date"          : text.parse_timestamp(timestamp),
        }

    def images(self, page):
        """Return a list of (image URL, None) tuples for this chapter

        URLs are the values enclosed in backslash-escaped quotes
        inside the unescaped 'pageOpts' section of the page.
        """
        quote = r"\""
        container = text.unescape(
            text.extr(page, 'pageOpts', ':[0,0]}"'))

        result = []
        for url in text.extract_iter(container, quote, quote):
            result.append((url, None))
        return result


class BatotoMangaExtractor(BatotoBase, MangaExtractor):
    """Extractor for the chapter list of a bato.to manga"""
    reverse = False
    chapterclass = BatotoChapterExtractor
    pattern = (BASE_PATTERN +
               r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
    example = "https://bato.to/title/12345-MANGA/"

    def __init__(self, match):
        """Take site root and manga ID from the matched URL"""
        root = text.root_from_url(match.group(0))
        # group 1 comes from '/title/<id>' URLs,
        # group 2 from '/series/<id>' URLs
        manga_id = match.group(1) or match.group(2)

        self.root = root
        self.manga_id = manga_id
        MangaExtractor.__init__(
            self, match, "%s/title/%s" % (root, manga_id))

    def chapters(self, page):
        """Return a list of (chapter URL, metadata dict) tuples

        Raises exception.StopExtraction with the page's warning text
        if the page contains an 'alert alert-warning' element.
        """
        extr = text.extract_from(page)

        alert = extr(' class="alert alert-warning">', "</div><")
        if alert:
            raise exception.StopExtraction("'%s'", text.remove_html(alert))

        page_title = extr("<title>", "<")
        common = {
            "manga_id": text.parse_int(self.manga_id),
            "manga"   : text.unescape(page_title.rpartition(" - ")[0]),
        }

        # result is discarded; the call is made for its effect on the
        # extractor's read position (presumably moving it to the start
        # of the chapter list - TODO confirm)
        extr('<div data-hk="0-0-0-0"', "")

        results = []
        href = extr('<a href="/title/', '"')
        while href:
            # chapter number is whatever follows the last '-ch_';
            # a '.' splits it into major and minor part
            number, dot, minor = href.rpartition("-ch_")[2].partition(".")
            timestamp = extr('time="', '"')

            entry = dict(common)
            entry["chapter"] = text.parse_int(number)
            entry["chapter_minor"] = dot + minor
            entry["date"] = text.parse_datetime(
                timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

            results.append(("%s/title/%s" % (self.root, href), entry))
            href = extr('<a href="/title/', '"')
        return results
[bato] add support 2023-12-27 04:33:33 +01:00			`# -- coding: utf-8 --`

			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[bato] simplify and update 2024-01-06 00:51:52 +01:00			`"""Extractors for https://bato.to/"""`
[bato] add support 2023-12-27 04:33:33 +01:00
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`from .common import Extractor, ChapterExtractor, MangaExtractor`
[bato] add support 2023-12-27 04:33:33 +01:00			`from .. import text, exception`
			`import re`

[batoto] support more mirror domains (#5042) 2024-01-09 18:02:49 +01:00			`BASE_PATTERN = (r"(?:https?://)?(?:"`
			`r"(?:ba\|d\|h\|m\|w)to\.to\|"`
			`r"(?:(?:manga\|read)toto\|batocomic\|[xz]bato)\.(?:com\|net\|org)\|"`
			`r"comiko\.(?:net\|org)\|"`
			`r"bat(?:otoo\|o?two)\.com)")`
[bato] add support 2023-12-27 04:33:33 +01:00
Fix linting 2023-12-27 05:41:37 +01:00
[bato] rename to 'batoto' to use the same category name as the previous bato.to site 2024-01-06 01:49:34 +01:00			`class BatotoBase():`
			`"""Base class for batoto extractors"""`
			`category = "batoto"`
[bato] add support 2023-12-27 04:33:33 +01:00			`root = "https://bato.to"`

[bato] simplify and update 2024-01-06 00:51:52 +01:00			`def request(self, url, **kwargs):`
			`kwargs["encoding"] = "utf-8"`
			`return Extractor.request(self, url, **kwargs)`

Fix linting 2023-12-27 05:41:37 +01:00
[bato] rename to 'batoto' to use the same category name as the previous bato.to site 2024-01-06 01:49:34 +01:00			`class BatotoChapterExtractor(BatotoBase, ChapterExtractor):`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`"""Extractor for bato.to manga chapters"""`
			`pattern = BASE_PATTERN + r"/(?:title/[^/?#]+\|chapter)/(\d+)"`
			`example = "https://bato.to/title/12345-MANGA/54321"`
[bato] add support 2023-12-27 04:33:33 +01:00
			`def __init__(self, match):`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`self.root = text.root_from_url(match.group(0))`
			`self.chapter_id = match.group(1)`
			`url = "{}/title/0/{}".format(self.root, self.chapter_id)`
			`ChapterExtractor.__init__(self, match, url)`
[bato] add support 2023-12-27 04:33:33 +01:00
			`def metadata(self, page):`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`extr = text.extract_from(page)`
[batoto] fix crash when manga/chapter contains a '-' (#5200) 2024-02-16 00:10:08 +01:00			`try:`
			`manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)`
			`except ValueError:`
			`manga = info = None`

[batoto] improve 'manga_id' extraction (#5042) 2024-01-09 17:25:04 +01:00			`manga_id = text.extr(`
			`extr('rel="canonical" href="', '"'), "/title/", "/")`
[bato] add support 2023-12-27 04:33:33 +01:00
[batoto] fix crash when manga/chapter contains a '-' (#5200) 2024-02-16 00:10:08 +01:00			`if not manga:`
			`manga = extr('link-hover">', "<")`
			`info = text.remove_html(extr('link-hover">', "</"))`
[batoto] add test, improve 'info' handling 2024-08-12 13:24:23 +02:00			`info = text.unescape(info)`
[batoto] fix crash when manga/chapter contains a '-' (#5200) 2024-02-16 00:10:08 +01:00
[bato] add support 2023-12-27 04:33:33 +01:00			`match = re.match(`
[batoto] improve chapter info regex 2 - make regex case-insensitive - match 'Season 12' and 'S12' as volume numbers - match 'Episode 12' as chapter number 2024-08-13 20:16:44 +02:00			`r"(?i)(?:(?:Volume\|S(?:eason)?)\s*(\d+)\s+)?"`
			`r"(?:Chapter\|Episode)\s(\d+)([\w.])", info)`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`if match:`
			`volume, chapter, minor = match.groups()`
			`else:`
			`volume = chapter = 0`
			`minor = ""`
[bato] add support 2023-12-27 04:33:33 +01:00
			`return {`
[batoto] update 2024-08-12 12:22:18 +02:00			`"manga" : text.unescape(manga),`
			`"manga_id" : text.parse_int(manga_id),`
[batoto] return 'chapter_url' as string (#5562) don't try to parse it as integer 2024-09-05 17:47:38 +02:00			`"chapter_url" : extr(self.chapter_id + "-ch_", '"'),`
[batoto] extract 'title' independent of chapter info (#5988) 2024-08-13 20:20:32 +02:00			`"title" : text.unescape(text.remove_html(extr(`
			`"selected>", "</option")).partition(" : ")[2]),`
[batoto] update 2024-08-12 12:22:18 +02:00			`"volume" : text.parse_int(volume),`
			`"chapter" : text.parse_int(chapter),`
			`"chapter_minor" : minor,`
[batoto] add test, improve 'info' handling 2024-08-12 13:24:23 +02:00			`"chapter_string": info,`
[batoto] update 2024-08-12 12:22:18 +02:00			`"chapter_id" : text.parse_int(self.chapter_id),`
			`"date" : text.parse_timestamp(extr(' time="', '"')[:-3]),`
[bato] add support 2023-12-27 04:33:33 +01:00			`}`

			`def images(self, page):`
			`images_container = text.extr(page, 'pageOpts', ':[0,0]}"')`
			`images_container = text.unescape(images_container)`
Fix linting 2023-12-27 05:41:37 +01:00			`return [`
			`(url, None)`
			`for url in text.extract_iter(images_container, r"\"", r"\"")`
			`]`
[bato] add support 2023-12-27 04:33:33 +01:00

[bato] rename to 'batoto' to use the same category name as the previous bato.to site 2024-01-06 01:49:34 +01:00			`class BatotoMangaExtractor(BatotoBase, MangaExtractor):`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`"""Extractor for bato.to manga"""`
[bato] add support 2023-12-27 04:33:33 +01:00			`reverse = False`
[bato] rename to 'batoto' to use the same category name as the previous bato.to site 2024-01-06 01:49:34 +01:00			`chapterclass = BatotoChapterExtractor`
[batoto] improve v2 manga URL pattern and add tests 2024-01-07 22:23:30 +01:00			`pattern = (BASE_PATTERN +`
			`r"/(?:title/(\d+)[^/?#]\|series/(\d+)(?:/[^/?#])?)/?$")`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`example = "https://bato.to/title/12345-MANGA/"`

			`def __init__(self, match):`
			`self.root = text.root_from_url(match.group(0))`
[batoto] improve v2 manga URL pattern and add tests 2024-01-07 22:23:30 +01:00			`self.manga_id = match.group(1) or match.group(2)`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`url = "{}/title/{}".format(self.root, self.manga_id)`
			`MangaExtractor.__init__(self, match, url)`
[bato] add support 2023-12-27 04:33:33 +01:00
			`def chapters(self, page):`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`extr = text.extract_from(page)`
Fix linting 2023-12-27 05:41:37 +01:00
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`warning = extr(' class="alert alert-warning">', "</div><")`
			`if warning:`
			`raise exception.StopExtraction("'%s'", text.remove_html(warning))`

			`data = {`
			`"manga_id": text.parse_int(self.manga_id),`
			`"manga" : text.unescape(extr(`
			`"<title>", "<").rpartition(" - ")[0]),`
			`}`

			`extr('<div data-hk="0-0-0-0"', "")`
[bato] add support 2023-12-27 04:33:33 +01:00			`results = []`
[bato] simplify and update 2024-01-06 00:51:52 +01:00			`while True:`
			`href = extr('<a href="/title/', '"')`
			`if not href:`
			`break`

			`chapter = href.rpartition("-ch_")[2]`
			`chapter, sep, minor = chapter.partition(".")`

			`data["chapter"] = text.parse_int(chapter)`
			`data["chapter_minor"] = sep + minor`
			`data["date"] = text.parse_datetime(`
			`extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")`

			`url = "{}/title/{}".format(self.root, href)`
[bato] add support 2023-12-27 04:33:33 +01:00			`results.append((url, data.copy()))`
Fix linting 2023-12-27 05:41:37 +01:00			`return results`