# -*- coding: utf-8 -*-

# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.zerochan.net/"""

from .booru import BooruExtractor
from ..cache import cache
from .. import text, util, exception
import collections
import re
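
# Matches the site root with or without URL scheme or "www." subdomain.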
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"


class ZerochanExtractor(BooruExtractor):
    """Base class for zerochan extractors"""
    category = "zerochan"
    root = "https://www.zerochan.net"
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}"
    page_start = 1
    per_page = 250
    cookies_domain = ".zerochan.net"
    cookies_names = ("z_id", "z_hash")
    request_interval = (0.5, 1.5)
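
    # Consider the session logged in when valid session cookies are
    # already present; otherwise attempt a username/password login.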
    def login(self):
        self._logged_in = True
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            return self.cookies_update(self._login_impl(username, password))

        self._logged_in = False
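
    # A successful login redirects, so an empty 'response.history' means
    # failure. The resulting cookies are cached for 90 days per username.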
    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        self.log.info("Logging in as %s", username)

        url = self.root + "/login"
        headers = {
            "Origin"  : self.root,
            "Referer" : url,
        }
        data = {
            "ref"     : "/",
            "name"    : username,
            "password": password,
            "login"   : "Login",
        }

        response = self.request(url, method="POST", headers=headers, data=data)
        if not response.history:
            raise exception.AuthenticationError()

        return response.cookies
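
    # Scrape an entry's HTML page; the field markers used below
    # ("contentUrl", "datePublished", "contentSize") suggest the values
    # come from the page's embedded JSON-LD metadata block.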
    def _parse_entry_html(self, entry_id):
        url = "{}/{}".format(self.root, entry_id)
        extr = text.extract_from(self.request(url).text)

        data = {
            "id"      : text.parse_int(entry_id),
            "author"  : text.parse_unicode_escapes(extr(' "name": "', '"')),
            "file_url": extr('"contentUrl": "', '"'),
            "date"    : text.parse_datetime(extr('"datePublished": "', '"')),
            "width"   : text.parse_int(extr('"width": "', ' ')),
            "height"  : text.parse_int(extr('"height": "', ' ')),
            "size"    : text.parse_bytes(extr('"contentSize": "', 'B')),
            "path"    : text.split_html(extr(
                'class="breadcrumbs', '</nav>'))[2:],
            "uploader": extr('href="/user/', '"'),
            "tags"    : extr('<ul id="tags"', '</ul>'),
            "source"  : text.unescape(text.extr(
                extr('id="source-url"', '</a>'), 'href="', '"')),
        }

        html = data["tags"]
        tags = data["tags"] = []
        for tag in html.split("<li class=")[1:]:
            category = text.extr(tag, '"', '"')
            name = text.extr(tag, 'data-tag="', '"')
            tags.append(category.partition(" ")[0].capitalize() + ":" + name)

        return data
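
    # Fetch an entry's metadata from its "?json" endpoint. The server may
    # emit raw control characters inside strings, which strict JSON
    # parsers reject, so those get stripped before retrying.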
    def _parse_entry_api(self, entry_id):
        url = "{}/{}?json".format(self.root, entry_id)
        txt = self.request(url).text
        try:
            item = util.json_loads(txt)
        except ValueError as exc:
            if " control character " not in str(exc):
                raise
            txt = re.sub(r"[\x00-\x1f\x7f]", "", txt)
            item = util.json_loads(txt)

        data = {
            "id"      : item["id"],
            "file_url": item["full"],
            "width"   : item["width"],
            "height"  : item["height"],
            "size"    : item["size"],
            "name"    : item["primary"],
            "md5"     : item["hash"],
            "source"  : item.get("source"),
        }

        if not self._logged_in:
            data["tags"] = item["tags"]

        return data
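
    # Group "<Category>:<name>" tags into per-category lists, e.g. a
    # hypothetical "Theme:School Uniform" tag would end up in
    # post["tags_theme"].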
    def _tags(self, post, page):
        tags = collections.defaultdict(list)
        for tag in post["tags"]:
            category, _, name = tag.partition(":")
            tags[category].append(name)
        for key, value in tags.items():
            post["tags_" + key.lower()] = value
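

# Tag-search results; the (?!\d+$) lookahead leaves purely numeric paths
# to ZerochanImageExtractor below.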
class ZerochanTagExtractor(ZerochanExtractor):
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
    example = "https://www.zerochan.net/TAG"

    def __init__(self, match):
        ZerochanExtractor.__init__(self, match)
        self.search_tag, self.query = match.groups()
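
    # Select the pagination strategy: HTML scraping (24 posts per page)
    # or the JSON API (250 posts per page, the default).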
    def _init(self):
        if self.config("pagination") == "html":
            self.posts = self.posts_html
            self.per_page = 24
        else:
            self.posts = self.posts_api
            self.session.headers["User-Agent"] = util.USERAGENT

    def metadata(self):
        return {"search_tags": text.unquote(
            self.search_tag.replace("+", " "))}
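
    # Paginate over HTML search results and pull posts from the
    # thumbnail list; with "metadata" enabled, each entry's page and
    # JSON endpoint are fetched as well.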
    def posts_html(self):
        url = self.root + "/" + self.search_tag
        params = text.parse_query(self.query)
        params["p"] = text.parse_int(params.get("p"), self.page_start)
        metadata = self.config("metadata")

        while True:
            page = self.request(url, params=params).text
            thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
            extr = text.extract_from(thumbs)

            while True:
                post = extr('<li class="', '>')
                if not post:
                    break

                if metadata:
                    entry_id = extr('href="/', '"')
                    post = self._parse_entry_html(entry_id)
                    post.update(self._parse_entry_api(entry_id))
                    yield post
                else:
                    yield {
                        "id"    : extr('href="/', '"'),
                        "name"  : extr('alt="', '"'),
                        "width" : extr('title="', '✕'),
                        "height": extr('', ' '),
                        "size"  : extr('', 'b'),
                        "file_url": "https://static." + extr(
                            '<a href="https://static.', '"'),
                    }

            if 'rel="next"' not in page:
                break
            params["p"] += 1
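
    # Paginate over the JSON search API; without "metadata", file URLs
    # are built from the post ID, assuming a .jpg extension with .png as
    # fallback.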
    def posts_api(self):
        url = self.root + "/" + self.search_tag
        metadata = self.config("metadata")
        params = {
            "json": "1",
            "l"   : self.per_page,
            "p"   : self.page_start,
        }

        static = "https://static.zerochan.net/.full."

        while True:
            response = self.request(url, params=params, allow_redirects=False)

            if response.status_code >= 300:
                url = text.urljoin(self.root, response.headers["location"])
                self.log.warning("HTTP redirect to %s", url)
                if self.config("redirects"):
                    continue
                raise exception.StopExtraction()

            data = response.json()
            try:
                posts = data["items"]
            except Exception:
                self.log.debug("Server response: %s", data)
                return

            if metadata:
                for post in posts:
                    post_id = post["id"]
                    post.update(self._parse_entry_html(post_id))
                    post.update(self._parse_entry_api(post_id))
                    yield post
            else:
                for post in posts:
                    base = static + str(post["id"])
                    post["file_url"] = base + ".jpg"
                    post["_fallback"] = (base + ".png",)
                    yield post

            if not data.get("next"):
                return
            params["p"] += 1
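

# A single post, e.g. "https://www.zerochan.net/12345".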
class ZerochanImageExtractor(ZerochanExtractor):
    subcategory = "image"
    pattern = BASE_PATTERN + r"/(\d+)"
    example = "https://www.zerochan.net/12345"

    def __init__(self, match):
        ZerochanExtractor.__init__(self, match)
        self.image_id = match.group(1)

    def posts(self):
        post = self._parse_entry_html(self.image_id)
        if self.config("metadata"):
            post.update(self._parse_entry_api(self.image_id))
        return (post,)