[2ch] update

- simplify extractor code
- more metadata
- add tests
2025-01-31 11:41:35 +01:00 · 2024-01-08 02:04:34 +01:00 · 2024-01-08 02:04:34 +01:00 · 68196589c4
commit 68196589c4
parent 6c4abc982e
2 changed files with 115 additions and 44 deletions
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -4,81 +4,88 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extractors for https://www.2ch.hk/"""
+"""Extractors for https://2ch.hk/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, util


 class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads"""
    category = "2ch"
    subcategory = "thread"
+    root = "https://2ch.hk"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
-    filename_fmt = "{file_id} - {filename}.{extension}"
-    archive_fmt = "{board}_{thread}_{file_id}"
-    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"
+    filename_fmt = "{tim}{filename:? //}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.hk/a/res/12345.html"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
-        url = f"https://2ch.hk/{self.board}/res/{self.thread}.json"
-        thread_data = self.request(url).json()
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        posts = self.request(url).json()["threads"][0]["posts"]

-        posts = thread_data["threads"][0]["posts"]
-        post = posts[0]
-        title = post.get("subject") or text.remove_html(post["comment"])
+        op = posts[0]
+        title = op.get("subject") or text.remove_html(op["comment"])

-        thread_metadata = {
-            "board": self.board,
+        thread = {
+            "board" : self.board,
            "thread": self.thread,
-            "title": text.unescape(title)[:50],
+            "title" : text.unescape(title)[:50],
        }

-        yield Message.Directory, thread_metadata
+        yield Message.Directory, thread
        for post in posts:
-            if "files" in post and post['files']:
-                for file in post['files']:
-                    file_metadata = {
-                        "post_num": post["num"],
-                        "file_id": file["name"].split('.')[0],
-                        "filename": ".".join(file["fullname"].split('.')[:-1]),
-                        "extension": file["name"].split('.')[-1],
-                    }
-                    file_metadata.update(thread_metadata)
+            files = post.get("files")
+            if files:
+                post["post_name"] = post["name"]
+                post["date"] = text.parse_timestamp(post["timestamp"])
+                del post["files"]
+                del post["name"]

-                    url = f"https://2ch.hk/{file['path']}"
-                    yield Message.Url, url, file_metadata
+                for file in files:
+                    file.update(thread)
+                    file.update(post)
+
+                    file["filename"] = file["fullname"].rpartition(".")[0]
+                    file["tim"], _, file["extension"] = \
+                        file["name"].rpartition(".")
+
+                    yield Message.Url, self.root + file["path"], file


 class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards"""
    category = "2ch"
    subcategory = "board"
-    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"
+    root = "https://2ch.hk"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
+    example = "https://2ch.hk/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board = match.group(1)

-    def get_pages(self):
-        url = f"https://2ch.hk/{self.board}/index.json"
-        index_page = self.request(url).json()
-        pages_total = len(index_page['pages'])
-
-        yield index_page
-        for i in range(1, pages_total):
-            url = f"https://2ch.hk/{self.board}/{i}.json"
-            yield self.request(url).json()
-
-    def get_thread_nums(self):
-        for page in self.get_pages():
-            for thread in page["threads"]:
-                yield thread["thread_num"]
-
    def items(self):
-        for thread_num in self.get_thread_nums():
-            url = f"https://2ch.hk/{self.board}/res/{thread_num}.html"
-            yield Message.Queue, url, {"_extractor": _2chThreadExtractor}
+        # index page
+        url = "{}/{}/index.json".format(self.root, self.board)
+        index = self.request(url).json()
+        index["_extractor"] = _2chThreadExtractor
+        for thread in index["threads"]:
+            url = "{}/{}/res/{}.html".format(
+                self.root, self.board, thread["thread_num"])
+            yield Message.Queue, url, index
+
+        # pages 1..n
+        for n in util.advance(index["pages"], 1):
+            url = "{}/{}/{}.json".format(self.root, self.board, n)
+            page = self.request(url).json()
+            page["_extractor"] = _2chThreadExtractor
+            for thread in page["threads"]:
+                url = "{}/{}/res/{}.html".format(
+                    self.root, self.board, thread["thread_num"])
+                yield Message.Queue, url, page
--- a/test/results/2ch.py
+++ b/test/results/2ch.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+gallery_dl = __import__("gallery_dl.extractor.2ch")
+_2ch = getattr(gallery_dl.extractor, "2ch")
+
+
+__tests__ = (
+{
+    "#url"     : "https://2ch.hk/a/res/6202876.html",
+    "#category": ("", "2ch", "thread"),
+    "#class"   : _2ch._2chThreadExtractor,
+    "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+",
+    "#count"   : range(450, 1000),
+
+    "banned"   : 0,
+    "board"    : "a",
+    "closed"   : 0,
+    "comment"  : str,
+    "date"     : "type:datetime",
+    "displayname": str,
+    "email"    : "",
+    "endless"  : 1,
+    "extension": str,
+    "filename" : str,
+    "fullname" : str,
+    "height"   : int,
+    "lasthit"  : 1705273977,
+    "md5"      : r"re:[0-9a-f]{32}",
+    "name"     : r"re:\d+\.\w+",
+    "num"      : int,
+    "number"   : range(1, 1000),
+    "op"       : 0,
+    "parent"   : int,
+    "path"     : r"re:/a/src/6202876/\d+\.\w+",
+    "post_name": "Аноним",
+    "size"     : int,
+    "sticky"   : 0,
+    "subject"  : str,
+    "thread"   : "6202876",
+    "thumbnail": str,
+    "tim"      : r"re:\d+",
+    "timestamp": int,
+    "title"    : "MP4/WEBM",
+    "tn_height": int,
+    "tn_width" : int,
+    "trip"     : "",
+    "type"     : int,
+    "views"    : int,
+    "width"    : int,
+},
+
+{
+    "#url"     : "https://2ch.hk/a/",
+    "#category": ("", "2ch", "board"),
+    "#class"   : _2ch._2chBoardExtractor,
+    "#pattern" : _2ch._2chThreadExtractor.pattern,
+    "#count"   : range(200, 300),
+},
+
+)