gallery-dl/gallery_dl/extractor/zerochan.py

# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.zerochan.net/"""

from .booru import BooruExtractor
from .. import text

# URL prefix shared by the extractor patterns in this module:
# optional http(s) scheme, optional "www." and the zerochan.net host.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"


class ZerochanExtractor(BooruExtractor):
    """Base class for zerochan extractors"""
    category = "zerochan"
    root = "https://www.zerochan.net"
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}"

    def _parse_entry_page(self, entry_id):
        """Request the page of 'entry_id' and return its metadata as dict

        The page at '<root>/<entry_id>' is downloaded and the individual
        values are cut out of its text between fixed marker strings.
        """
        response = self.request("%s/%s" % (self.root, entry_id))
        extr = text.extract_from(response.text)

        # NOTE(review): the lookups are kept in their original order;
        # extract_from() looks like it scans forward through the text,
        # so confirm before reordering any of them.
        author = extr('"author": "', '"')
        file_url = extr('"contentUrl": "', '"')
        published = extr('"datePublished": "', '"')
        date = text.parse_datetime(published, "%a %b %d %H:%M:%S %Y")
        width = extr('"width": "', ' ')
        height = extr('"height": "', ' ')
        size = extr('"contentSize": "', 'B')

        return {
            "id": entry_id,
            "author": author,
            "file_url": file_url,
            "date": date,
            "width": width,
            "height": height,
            "size": size,
        }


class ZerochanTagExtractor(ZerochanExtractor):
    """Extractor for the posts listed on a zerochan tag page"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    # the negative lookahead rejects purely numeric paths, which are
    # matched by the image extractor's pattern instead
    pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
    test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {
        "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
        "count": "> 24",
        # spelled "keyword" to match the key used by the other test
        # definition in this module
        "keyword": {
            "extension": r"re:jpg|png",
            # every post is expected to carry a (non-constant) URL string
            "file_url": str,
            "filename": r"re:Perth.\(Kantai.Collection\).full.\d+",
            "height": r"re:^\d+$",
            "id": r"re:^\d+$",
            "name": "Perth (Kantai Collection)",
            "search_tags": "Perth (Kantai Collection)",
            "size": r"re:^\d+k$",
            "width": r"re:^\d+$",
        },
    })

    def __init__(self, match):
        """Store the tag and the optional query string of the matched URL"""
        ZerochanExtractor.__init__(self, match)
        self.search_tag, self.query = match.groups()

    def metadata(self):
        """Return the searched tag with '+' and %-escapes decoded"""
        return {"search_tags": text.unquote(
            self.search_tag.replace("+", " "))}

    def posts(self):
        """Yield one metadata dict per thumbnail, following 'next' pages

        Starts at the page number given by the 'p' query parameter
        (1 if absent) and keeps requesting the following page for as
        long as the current page contains a 'rel="next"' marker.
        """
        url = self.root + "/" + self.search_tag
        params = text.parse_query(self.query)
        params["p"] = text.parse_int(params.get("p"), 1)

        while True:
            page = self.request(url, params=params).text
            thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
            if thumbs is None:
                # NOTE(review): assumes text.extract() yields None when
                # the markers are missing - without a thumbnail list
                # there is nothing to parse on this page, so stop here
                # instead of handing None to extract_from().
                break
            extr = text.extract_from(thumbs)

            while True:
                post = extr('<li class="', '>')
                if not post:
                    break
                # the fields are read consecutively from the same <li>
                # element; width, height and size all come out of the
                # 'title' attribute
                yield {
                    "id"    : extr('href="/', '"'),
                    "name"  : extr('alt="', '"'),
                    "width" : extr('title="', 'x'),
                    "height": extr('', ' '),
                    "size"  : extr('', 'B'),
                    "file_url": "https://static." + extr(
                        '<a href="https://static.', '"'),
                }

            if 'rel="next"' not in page:
                break
            params["p"] += 1


class ZerochanImageExtractor(ZerochanExtractor):
    """Extractor for a single zerochan entry given by its numeric ID"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"/(\d+)"
    test = ("https://www.zerochan.net/2920445", {
        # every literal dot is escaped so that it cannot match an
        # arbitrary character
        "pattern": r"https://static\.zerochan\.net/"
                   r"Perth\.%28Kantai\.Collection%29\.full\.2920445\.jpg",
        "keyword": {
            "author": "YukinoTokisaki",
            "date": "dt:2020-04-24 21:33:44",
            "file_url": str,
            "filename": "Perth.(Kantai.Collection).full.2920445",
            "height": "1366",
            "id": "2920445",
            "size": "1929k",
            "width": "1920",
        },
    })

    def __init__(self, match):
        """Store the entry ID captured from the matched URL"""
        ZerochanExtractor.__init__(self, match)
        self.image_id = match.group(1)

    def posts(self):
        """Return a 1-tuple with the metadata parsed from the entry page"""
        return (self._parse_entry_page(self.image_id),)