gallery-dl/gallery_dl/extractor/4chan.py

# -*- coding: utf-8 -*-

# Copyright 2015-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.4chan.org/"""

from .common import Extractor, Message
from .. import text


class _4chanThreadExtractor(Extractor):
    """Extractor for 4chan threads

    Fetches a thread's JSON representation from a.4cdn.org and yields one
    URL message for every post that carries a file attachment.
    """
    category = "4chan"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{tim} {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    # group 1: board name, group 2: numeric thread ID
    pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org"
               r"/([^/]+)/thread/(\d+)")
    test = (
        ("https://boards.4chan.org/tg/thread/15396072/", {
            "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
            "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
            "content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec",
        }),
        ("https://boards.4channel.org/tg/thread/15396072/", {
            "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
            "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
        }),
    )

    def __init__(self, match):
        """Store the board name and thread ID captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        """Yield Version, Directory and one Url message per attached file

        The directory metadata holds 'board', 'thread' and 'title'; the
        title is the opening post's subject or, failing that, its comment
        with HTML removed, unescaped and truncated to 50 characters.
        Each Url message carries the post's own JSON dict, extended with
        the directory metadata and an 'extension' key.
        """
        url = "https://a.4cdn.org/{}/thread/{}.json".format(
            self.board, self.thread)
        posts = self.request(url).json()["posts"]

        # Both 'sub' (subject) and 'com' (comment) may be missing from the
        # opening post, e.g. when it consists of nothing but an image.
        # Fall back to an empty title instead of raising KeyError.
        first = posts[0]
        title = first.get("sub") or text.remove_html(first.get("com") or "")

        data = {
            "board" : self.board,
            "thread": self.thread,
            "title" : text.unescape(title)[:50],
        }

        yield Message.Version, 1
        yield Message.Directory, data
        for post in posts:
            # only posts with an attachment have a 'filename' entry
            if "filename" in post:
                post.update(data)
                # 'ext' includes the leading dot; strip it for 'extension'
                post["extension"] = post["ext"][1:]
                post["filename"] = text.unescape(post["filename"])
                url = "https://i.4cdn.org/{}/{}{}".format(
                    post["board"], post["tim"], post["ext"])
                yield Message.Url, url, post


class _4chanBoardExtractor(Extractor):
    """Extractor for 4chan boards

    Reads a board's 'threads.json' listing and hands every thread it
    contains over to _4chanThreadExtractor via Queue messages.
    """
    category = "4chan"
    subcategory = "board"
    # group 1: board name; an optional trailing page number is ignored
    pattern = r"(?:https?://)?boards\.4chan(?:nel)?\.org/([^/?&#]+)/\d*$"
    test = (
        "https://boards.4channel.org/po/",
        {
            "pattern": _4chanThreadExtractor.pattern,
            "count": ">= 100",
        },
    )

    def __init__(self, match):
        """Remember the board name captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board = match.group(1)

    def items(self):
        """Yield a Queue message for each thread listed on the board

        The metadata of every message is the thread's own JSON entry,
        extended with the number of the page it was listed on ('page')
        and the extractor class meant to process it ('_extractor').
        """
        board = self.board
        listing_url = "https://a.4cdn.org/{}/threads.json".format(board)
        pages = self.request(listing_url).json()

        for page in pages:
            for entry in page["threads"]:
                thread_url = "https://boards.4chan.org/{}/thread/{}/".format(
                    board, entry["no"])
                entry.update(
                    page=page["page"],
                    _extractor=_4chanThreadExtractor,
                )
                yield Message.Queue, thread_url, entry
add extractor '4chan' 2015-04-14 15:04:07 +02:00			`# -- coding: utf-8 --`

simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`# Copyright 2015-2019 Mike Fährmann`
add extractor '4chan' 2015-04-14 15:04:07 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[4chan] simplify - remove 'chan.py' - slight adjustments to directory and filenames 2019-11-02 20:11:21 +01:00			`"""Extractors for https://www.4chan.org/"""`
add extractor '4chan' 2015-04-14 15:04:07 +02:00
[4chan] simplify - remove 'chan.py' - slight adjustments to directory and filenames 2019-11-02 20:11:21 +01:00			`from .common import Extractor, Message`
[4chan] unescape filenames 2018-06-12 22:19:13 +02:00			`from .. import text`
add extractor '4chan' 2015-04-14 15:04:07 +02:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
[4chan] simplify - remove 'chan.py' - slight adjustments to directory and filenames 2019-11-02 20:11:21 +01:00			`class _4chanThreadExtractor(Extractor):`
			`"""Extractor for 4chan threads"""`
[chan] update to new format 2015-11-21 03:13:06 +01:00			`category = "4chan"`
[4chan] simplify - remove 'chan.py' - slight adjustments to directory and filenames 2019-11-02 20:11:21 +01:00			`subcategory = "thread"`
			`directory_fmt = ("{category}", "{board}", "{thread} {title}")`
			`filename_fmt = "{tim} {filename}.{extension}"`
			`archive_fmt = "{board}_{thread}_{tim}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org"`
			`r"/([^/]+)/thread/(\d+)")`
			`test = (`
[4chan] support 4channel.org domain 2018-11-21 17:40:38 +01:00			`("https://boards.4chan.org/tg/thread/15396072/", {`
			`"url": "39082ad166161966d7ba8e37f2173a824eb540f0",`
			`"keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",`
			`"content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec",`
			`}),`
			`("https://boards.4channel.org/tg/thread/15396072/", {`
			`"url": "39082ad166161966d7ba8e37f2173a824eb540f0",`
			`"keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`)`
[4chan] unescape filenames 2018-06-12 22:19:13 +02:00
[4chan] simplify - remove 'chan.py' - slight adjustments to directory and filenames 2019-11-02 20:11:21 +01:00			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.board, self.thread = match.groups()`

			`def items(self):`
			`url = "https://a.4cdn.org/{}/thread/{}.json".format(`
			`self.board, self.thread)`
			`posts = self.request(url).json()["posts"]`
			`title = posts[0].get("sub") or text.remove_html(posts[0]["com"])`

			`data = {`
			`"board" : self.board,`
			`"thread": self.thread,`
			`"title" : text.unescape(title)[:50],`
			`}`

			`yield Message.Version, 1`
			`yield Message.Directory, data`
			`for post in posts:`
			`if "filename" in post:`
			`post.update(data)`
			`post["extension"] = post["ext"][1:]`
			`post["filename"] = text.unescape(post["filename"])`
			`url = "https://i.4cdn.org/{}/{}{}".format(`
			`post["board"], post["tim"], post["ext"])`
			`yield Message.Url, url, post`
[4chan] add extractor for entire boards (closes #510) 2019-12-09 20:36:05 +01:00

			`class _4chanBoardExtractor(Extractor):`
			`"""Extractor for 4chan boards"""`
			`category = "4chan"`
			`subcategory = "board"`
			`pattern = r"(?:https?://)?boards\.4chan(?:nel)?\.org/([^/?&#]+)/\d*$"`
			`test = ("https://boards.4channel.org/po/", {`
			`"pattern": _4chanThreadExtractor.pattern,`
			`"count": ">= 100",`
			`})`

			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.board = match.group(1)`

			`def items(self):`
			`url = "https://a.4cdn.org/{}/threads.json".format(self.board)`
			`threads = self.request(url).json()`

			`for page in threads:`
			`for thread in page["threads"]:`
			`url = "https://boards.4chan.org/{}/thread/{}/".format(`
			`self.board, thread["no"])`
			`thread["page"] = page["page"]`
			`thread["_extractor"] = _4chanThreadExtractor`
			`yield Message.Queue, url, thread`