# gallery-dl/gallery_dl/extractor/zerochan.py

# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.zerochan.net/"""

from .booru import BooruExtractor
from ..cache import cache
from .. import text, exception
from xml.etree import ElementTree


BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"


class ZerochanExtractor(BooruExtractor):
    """Shared login and entry-parsing logic for zerochan extractors"""
    category = "zerochan"
    root = "https://www.zerochan.net"
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}"
    cookiedomain = ".zerochan.net"
    cookienames = ("z_id", "z_hash")

    def login(self):
        """Log in if necessary and request the legacy page layout

        Unless '_check_cookies(self.cookienames)' succeeds, credentials are
        fetched with '_get_auth_info()'; when a username is available, the
        cookies returned by '_login_impl()' are passed to
        '_update_cookies()'. The 'v3' cookie is set afterwards in any case.
        """
        have_cookies = self._check_cookies(self.cookienames)
        if not have_cookies:
            username, password = self._get_auth_info()
            if username:
                cookies = self._login_impl(username, password)
                self._update_cookies(cookies)

        # force legacy layout
        self.session.cookies.set("v3", "0", domain=self.cookiedomain)

    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        """POST the login form and return the cookies of the response

        Raises exception.AuthenticationError when 'response.history' is
        empty (presumably: the POST was not answered with a redirect).
        """
        self.log.info("Logging in as %s", username)

        login_url = self.root + "/login"
        response = self.request(
            login_url,
            method="POST",
            headers={
                "Origin"  : self.root,
                "Referer" : login_url,
            },
            data={
                "ref"     : "/",
                "name"    : username,
                "password": password,
                "login"   : "Login",
            },
        )

        if response.history:
            return response.cookies
        raise exception.AuthenticationError()

    def _parse_entry_html(self, entry_id):
        """Build a metadata dict for 'entry_id' from its HTML page

        The page at '<root>/<entry_id>' is requested and the values are
        cut out of its text between fixed begin/end markers.
        """
        page_url = "{root}/{id}".format(root=self.root, id=entry_id)
        extr = text.extract_from(self.request(page_url).text)

        # NOTE: the 'extr' calls below are kept in their original sequence;
        # presumably each call continues where the previous one stopped,
        # so reordering them could change the results.
        author = extr('"author": "', '"')
        file_url = extr('"contentUrl": "', '"')
        date = text.parse_datetime(
            extr('"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y")
        width = extr('"width": "', ' ')
        height = extr('"height": "', ' ')
        size = text.parse_bytes(extr('"contentSize": "', 'B'))
        # every second breadcrumb fragment, starting with the fourth
        path = text.split_html(extr('class="breadcrumbs', '</p>'))[3::2]
        tags = extr('alt="Tags: Anime, ', '"').split(", ")

        return {
            "id"    : entry_id,
            "author": author,
            "file_url": file_url,
            "date"  : date,
            "width" : width,
            "height": height,
            "size"  : size,
            "path"  : path,
            "tags"  : tags,
        }

    def _parse_entry_xml(self, entry_id):
        """Build a dict with 'name', 'tags', and 'md5' for 'entry_id'

        The values are read by position from the document returned for
        '<root>/<entry_id>?xml'.
        """
        xml_url = "{root}/{id}?xml".format(root=self.root, id=entry_id)
        document = ElementTree.fromstring(self.request(xml_url).text)
        # last child of the document root's first child
        item = document[0][-1]

        # NOTE: 'id', 'file_url', 'width', 'height', and 'size' are not
        # taken from this document; only the three fields below are.
        name = item[2].text
        tags = item[5].text.lstrip().split(", ")
        md5 = item[6].text

        return {
            "name"  : name,
            "tags"  : tags,
            "md5"   : md5,
        }


class ZerochanTagExtractor(ZerochanExtractor):
    """Extractor for the posts listed on a zerochan tag page"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    # one path segment that does not consist solely of digits up to the
    # end of the URL, optionally followed by '/' and a query string
    pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
    test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {
        "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
        "count": "> 24",
        # spelled "keyword", matching the ZerochanImageExtractor test
        "keyword": {
            "extension": r"re:jpg|png",
            # same expression as the URL "pattern" above; an empty string
            # here would demand an empty 'file_url'
            "file_url": r"re:https://static\.zerochan\.net"
                        r"/.+\.full\.\d+\.(jpg|png)",
            "filename": r"re:Perth.\(Kantai.Collection\).full.\d+",
            "height": r"re:^\d+$",
            "id": r"re:^\d+$",
            "name": "Perth (Kantai Collection)",
            "search_tags": "Perth (Kantai Collection)",
            "size": r"re:^\d+k$",
            "width": r"re:^\d+$",
        },
    })

    def __init__(self, match):
        """Store the tag path segment and the raw query string of the URL"""
        ZerochanExtractor.__init__(self, match)
        self.search_tag, self.query = match.groups()

    def metadata(self):
        """Return a dict with the searched tag as 'search_tags'

        '+' characters of the URL segment are turned into spaces before
        the result is passed through 'text.unquote()'.
        """
        return {"search_tags": text.unquote(
            self.search_tag.replace("+", " "))}

    def posts(self):
        """Yield one metadata dict per thumbnail of every result page

        With the 'metadata' option enabled, each post is built from its
        own HTML page and updated with the values of its '?xml' document;
        otherwise the values are cut directly out of the listing markup.
        Pages are requested until one lacks a 'rel="next"' marker.
        """
        url = self.root + "/" + self.search_tag
        params = text.parse_query(self.query)
        # start at the page given in the URL's query, or at page 1
        params["p"] = text.parse_int(params.get("p"), 1)
        metadata = self.config("metadata")

        while True:
            page = self.request(url, params=params).text
            thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
            extr = text.extract_from(thumbs)

            while True:
                # an empty result means no further '<li class="' was found
                post = extr('<li class="', '>')
                if not post:
                    break

                if metadata:
                    entry_id = extr('href="/', '"')
                    post = self._parse_entry_html(entry_id)
                    # keys present in both ('tags') take the XML value
                    post.update(self._parse_entry_xml(entry_id))
                    yield post
                else:
                    # NOTE: the order of these 'extr' calls is significant;
                    # the empty begin markers for 'height' and 'size'
                    # presumably continue right after the previous match
                    yield {
                        "id"    : extr('href="/', '"'),
                        "name"  : extr('alt="', '"'),
                        "width" : extr('title="', 'x'),
                        "height": extr('', ' '),
                        "size"  : extr('', 'B'),
                        "file_url": "https://static." + extr(
                            '<a href="https://static.', '"'),
                    }

            if 'rel="next"' not in page:
                break
            params["p"] += 1


class ZerochanImageExtractor(ZerochanExtractor):
    """Extractor for a single zerochan image page"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"/(\d+)"
    test = ("https://www.zerochan.net/2920445", {
        "pattern": r"https://static\.zerochan\.net/"
                   r"Perth\.%28Kantai\.Collection%29\.full\.2920445\.jpg",
        "keyword": {
            "author": "YukinoTokisaki",
            "date": "dt:2020-04-24 21:33:44",
            "file_url": str,
            "filename": "Perth.(Kantai.Collection).full.2920445",
            "height": "1366",
            "id": "2920445",
            # '_parse_entry_html' returns text.parse_bytes(...) for 'size'
            # rather than the raw "1929k" text; 1929 * 1024 assumes that
            # 'k' is treated as 1024 -- TODO confirm against parse_bytes
            "size": 1975296,
            "width": "1920",
        },
    })

    def __init__(self, match):
        """Store the numeric image ID captured from the URL"""
        ZerochanExtractor.__init__(self, match)
        self.image_id = match.group(1)

    def posts(self):
        """Return a 1-tuple containing the metadata dict of the image

        The dict comes from the image's HTML page; with the 'metadata'
        option enabled it is updated with the values of the '?xml'
        document as well.
        """
        post = self._parse_entry_html(self.image_id)
        if self.config("metadata"):
            post.update(self._parse_entry_xml(self.image_id))
        return (post,)
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00			`# -- coding: utf-8 --`

			`# Copyright 2022 Mike Fährmann`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://www.zerochan.net/"""`

			`from .booru import BooruExtractor`
[zerochan] implement login with username & password (#1434) 2022-07-29 12:49:04 +02:00			`from ..cache import cache`
			`from .. import text, exception`
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00			`from xml.etree import ElementTree`

[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00
			`BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"`


			`class ZerochanExtractor(BooruExtractor):`
			`"""Base class for zerochan extractors"""`
			`category = "zerochan"`
			`root = "https://www.zerochan.net"`
			`filename_fmt = "{id}.{extension}"`
			`archive_fmt = "{id}"`
[zerochan] implement login with username & password (#1434) 2022-07-29 12:49:04 +02:00			`cookiedomain = ".zerochan.net"`
			`cookienames = ("z_id", "z_hash")`

			`def login(self):`
			`if not self._check_cookies(self.cookienames):`
			`username, password = self._get_auth_info()`
			`if username:`
			`self._update_cookies(self._login_impl(username, password))`
			`# force legacy layout`
			`self.session.cookies.set("v3", "0", domain=self.cookiedomain)`

			`@cache(maxage=90*86400, keyarg=1)`
			`def _login_impl(self, username, password):`
			`self.log.info("Logging in as %s", username)`

			`url = self.root + "/login"`
			`headers = {`
			`"Origin" : self.root,`
			`"Referer" : url,`
			`}`
			`data = {`
			`"ref" : "/",`
			`"name" : username,`
			`"password": password,`
			`"login" : "Login",`
			`}`

			`response = self.request(url, method="POST", headers=headers, data=data)`
			`if not response.history:`
			`raise exception.AuthenticationError()`

			`return response.cookies`
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00			`def _parse_entry_html(self, entry_id):`
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00			`url = "{}/{}".format(self.root, entry_id)`
			`extr = text.extract_from(self.request(url).text)`

			`return {`
			`"id" : entry_id,`
			`"author": extr('"author": "', '"'),`
			`"file_url": extr('"contentUrl": "', '"'),`
			`"date" : text.parse_datetime(extr(`
			`'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),`
			`"width" : extr('"width": "', ' '),`
			`"height": extr('"height": "', ' '),`
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00			`"size" : text.parse_bytes(extr('"contentSize": "', 'B')),`
[zerochan] extract more metadata for single posts Neither HTML pages nor RSS feed entries have all metadata. It might be necessary to do 1-2 extra HTTP requests to grab everything. 2022-08-14 17:26:29 +02:00			`"path" : text.split_html(extr(`
			`'class="breadcrumbs', '</p>'))[3::2],`
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00			`"tags" : extr('alt="Tags: Anime, ', '"').split(", ")`
			`}`

			`def _parse_entry_xml(self, entry_id):`
			`url = "{}/{}?xml".format(self.root, entry_id)`
			`item = ElementTree.fromstring(self.request(url).text)[0][-1]`
			`# content = item[4].attrib`

			`return {`
			`# "id" : entry_id,`
			`# "file_url": content["url"],`
			`# "width" : content["width"],`
			`# "height": content["height"],`
			`# "size" : content["filesize"],`
			`"name" : item[2].text,`
			`"tags" : item[5].text.lstrip().split(", "),`
			`"md5" : item[6].text,`
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00			`}`


			`class ZerochanTagExtractor(ZerochanExtractor):`
			`subcategory = "tag"`
			`directory_fmt = ("{category}", "{search_tags}")`
			`pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"`
			`test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {`
			`"pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg\|png)",`
			`"count": "> 24",`
			`"keywords": {`
			`"extension": r"re:jpg\|png",`
			`"file_url": "",`
			`"filename": r"re:Perth.\(Kantai.Collection\).full.\d+",`
			`"height": r"re:^\d+$",`
			`"id": r"re:^\d+$",`
			`"name": "Perth (Kantai Collection)",`
			`"search_tags": "Perth (Kantai Collection)",`
			`"size": r"re:^\d+k$",`
			`"width": r"re:^\d+$",`
			`},`
			`})`

			`def __init__(self, match):`
			`ZerochanExtractor.__init__(self, match)`
			`self.search_tag, self.query = match.groups()`

			`def metadata(self):`
			`return {"search_tags": text.unquote(`
			`self.search_tag.replace("+", " "))}`

			`def posts(self):`
			`url = self.root + "/" + self.search_tag`
			`params = text.parse_query(self.query)`
			`params["p"] = text.parse_int(params.get("p"), 1)`
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00			`metadata = self.config("metadata")`
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00
			`while True:`
			`page = self.request(url, params=params).text`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`thumbs = text.extr(page, '<ul id="thumbs', '</ul>')`
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00			`extr = text.extract_from(thumbs)`

			`while True:`
			`post = extr('<li class="', '>')`
			`if not post:`
			`break`
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00
			`if metadata:`
			`entry_id = extr('href="/', '"')`
			`post = self._parse_entry_html(entry_id)`
			`post.update(self._parse_entry_xml(entry_id))`
			`yield post`
			`else:`
			`yield {`
			`"id" : extr('href="/', '"'),`
			`"name" : extr('alt="', '"'),`
			`"width" : extr('title="', 'x'),`
			`"height": extr('', ' '),`
			`"size" : extr('', 'B'),`
			`"file_url": "https://static." + extr(`
			`'<a href="https://static.', '"'),`
			`}`
[zerochan] add 'tag' and 'image' extractors (#1434) 2022-07-27 22:58:23 +02:00
			`if 'rel="next"' not in page:`
			`break`
			`params["p"] += 1`


			`class ZerochanImageExtractor(ZerochanExtractor):`
			`subcategory = "image"`
			`pattern = BASE_PATTERN + r"/(\d+)"`
			`test = ("https://www.zerochan.net/2920445", {`
			`"pattern": r"https://static\.zerochan\.net/"`
			`r"Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg",`
			`"keyword": {`
			`"author": "YukinoTokisaki",`
			`"date": "dt:2020-04-24 21:33:44",`
			`"file_url": str,`
			`"filename": "Perth.(Kantai.Collection).full.2920445",`
			`"height": "1366",`
			`"id": "2920445",`
			`"size": "1929k",`
			`"width": "1920",`
			`},`
			`})`

			`def __init__(self, match):`
			`ZerochanExtractor.__init__(self, match)`
			`self.image_id = match.group(1)`

			`def posts(self):`
[zerochan] add 'metadata' option (#2861) 2022-09-01 21:44:22 +02:00			`post = self._parse_entry_html(self.image_id)`
			`if self.config("metadata"):`
			`post.update(self._parse_entry_xml(self.image_id))`
			`return (post,)`