gallery-dl/gallery_dl/extractor/2ch.py

# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://2ch.hk/"""

from .common import Extractor, Message
from .. import text, util


class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads"""
    category = "2ch"
    subcategory = "thread"
    root = "https://2ch.hk"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{tim}{filename:? //}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
    example = "https://2ch.hk/a/res/12345.html"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
        posts = self.request(url).json()["threads"][0]["posts"]

        op = posts[0]
        title = op.get("subject") or text.remove_html(op["comment"])

        thread = {
            "board" : self.board,
            "thread": self.thread,
            "title" : text.unescape(title)[:50],
        }

        yield Message.Directory, thread
        for post in posts:
            files = post.get("files")
            if files:
                post["post_name"] = post["name"]
                post["date"] = text.parse_timestamp(post["timestamp"])
                del post["files"]
                del post["name"]

                for file in files:
                    file.update(thread)
                    file.update(post)

                    file["filename"] = file["fullname"].rpartition(".")[0]
                    file["tim"], _, file["extension"] = \
                        file["name"].rpartition(".")

                    yield Message.Url, self.root + file["path"], file


class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards"""
    category = "2ch"
    subcategory = "board"
    root = "https://2ch.hk"
    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
    example = "https://2ch.hk/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board = match.group(1)

    def items(self):
        # index page
        url = "{}/{}/index.json".format(self.root, self.board)
        index = self.request(url).json()
        index["_extractor"] = _2chThreadExtractor
        for thread in index["threads"]:
            url = "{}/{}/res/{}.html".format(
                self.root, self.board, thread["thread_num"])
            yield Message.Queue, url, index

        # pages 1..n
        for n in util.advance(index["pages"], 1):
            url = "{}/{}/{}.json".format(self.root, self.board, n)
            page = self.request(url).json()
            page["_extractor"] = _2chThreadExtractor
            for thread in page["threads"]:
                url = "{}/{}/res/{}.html".format(
                    self.root, self.board, thread["thread_num"])
                yield Message.Queue, url, page
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00			`# -- coding: utf-8 --`

			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`"""Extractors for https://2ch.hk/"""`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00
			`from .common import Extractor, Message`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`from .. import text, util`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00

			`class _2chThreadExtractor(Extractor):`
			`"""Extractor for 2ch threads"""`
			`category = "2ch"`
			`subcategory = "thread"`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`root = "https://2ch.hk"`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00			`directory_fmt = ("{category}", "{board}", "{thread} {title}")`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`filename_fmt = "{tim}{filename:? //}.{extension}"`
			`archive_fmt = "{board}_{thread}_{tim}"`
			`pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"`
			`example = "https://2ch.hk/a/res/12345.html"`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.board, self.thread = match.groups()`

			`def items(self):`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)`
			`posts = self.request(url).json()["threads"][0]["posts"]`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`op = posts[0]`
			`title = op.get("subject") or text.remove_html(op["comment"])`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`thread = {`
			`"board" : self.board,`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00			`"thread": self.thread,`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`"title" : text.unescape(title)[:50],`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00			`}`

[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`yield Message.Directory, thread`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00			`for post in posts:`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`files = post.get("files")`
			`if files:`
			`post["post_name"] = post["name"]`
			`post["date"] = text.parse_timestamp(post["timestamp"])`
			`del post["files"]`
			`del post["name"]`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`for file in files:`
			`file.update(thread)`
			`file.update(post)`

			`file["filename"] = file["fullname"].rpartition(".")[0]`
			`file["tim"], _, file["extension"] = \`
			`file["name"].rpartition(".")`

			`yield Message.Url, self.root + file["path"], file`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00

			`class _2chBoardExtractor(Extractor):`
			`"""Extractor for 2ch boards"""`
			`category = "2ch"`
			`subcategory = "board"`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`root = "https://2ch.hk"`
			`pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"`
			`example = "https://2ch.hk/a/"`
[2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites 2023-08-17 23:23:22 +02:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.board = match.group(1)`

			`def items(self):`
[2ch] update - simplify extractor code - more metadata - add tests 2024-01-08 02:04:34 +01:00			`# index page`
			`url = "{}/{}/index.json".format(self.root, self.board)`
			`index = self.request(url).json()`
			`index["_extractor"] = _2chThreadExtractor`
			`for thread in index["threads"]:`
			`url = "{}/{}/res/{}.html".format(`
			`self.root, self.board, thread["thread_num"])`
			`yield Message.Queue, url, index`

			`# pages 1..n`
			`for n in util.advance(index["pages"], 1):`
			`url = "{}/{}/{}.json".format(self.root, self.board, n)`
			`page = self.request(url).json()`
			`page["_extractor"] = _2chThreadExtractor`
			`for thread in page["threads"]:`
			`url = "{}/{}/res/{}.html".format(`
			`self.root, self.board, thread["thread_num"])`
			`yield Message.Queue, url, page`