diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 319ebe2d..0bcec2b7 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -8,21 +8,21 @@ """Extractors for 4chan archives based on FoolFuuka""" -from .common import Extractor, Message, generate_extractors +from .common import BaseExtractor, Message from .. import text import itertools -class FoolfuukaExtractor(Extractor): +class FoolfuukaExtractor(BaseExtractor): """Base extractor for FoolFuuka based boards/archives""" basecategory = "foolfuuka" archive_fmt = "{board[shortname]}_{num}_{timestamp}" external = "default" def __init__(self, match): - Extractor.__init__(self, match) + BaseExtractor.__init__(self, match) self.session.headers["Referer"] = self.root - if self.external == "direct": + if self.category == "b4k": self.remote = self._remote_direct def items(self): @@ -43,7 +43,7 @@ class FoolfuukaExtractor(Extractor): yield Message.Url, url, post def metadata(self): - """ """ + """Return general metadata""" def posts(self): """Return an iterable with all relevant posts""" @@ -59,16 +59,90 @@ class FoolfuukaExtractor(Extractor): return media["remote_media_link"] +BASE_PATTERN = FoolfuukaExtractor.update({ + "4plebs": { + "root": "https://archive.4plebs.org", + "pattern": r"(?:archive\.)?4plebs\.org", + }, + "archivedmoe": { + "root": "https://archived.moe", + }, + "archiveofsins": { + "root": "https://archiveofsins.com", + "pattern": r"(?:www\.)?archiveofsins\.com", + }, + "b4k": { + "root": "https://arch.b4k.co", + }, + "desuarchive": { + "root": "https://desuarchive.org", + }, + "fireden": { + "root": "https://boards.fireden.net", + }, + "nyafuu": { + "root": "https://archive.nyafuu.org", + "pattern": r"(?:archive\.)?nyafuu\.org", + }, + "rbt": { + "root": "https://rbt.asia", + "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", + }, + "thebarchive": { + "root": "https://thebarchive.com", + "pattern": r"thebarchive\.com", + 
}, +}) + + class FoolfuukaThreadExtractor(FoolfuukaExtractor): """Base extractor for threads on FoolFuuka based boards/archives""" subcategory = "thread" directory_fmt = ("{category}", "{board[shortname]}", "{thread_num}{title:? - //}") - pattern_fmt = r"/([^/?#]+)/thread/(\d+)" + pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)" + test = ( + ("https://archive.4plebs.org/tg/thread/54059290", { + "url": "07452944164b602502b02b24521f8cee5c484d2a", + }), + ("https://archived.moe/gd/thread/309639/", { + "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8", + "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", + }), + ("https://archived.moe/a/thread/159767162/", { + "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87", + }), + ("https://archiveofsins.com/h/thread/4668813/", { + "url": "f612d287087e10a228ef69517cf811539db9a102", + "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4", + }), + ("https://arch.b4k.co/meta/thread/196/", { + "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a", + }), + ("https://desuarchive.org/a/thread/159542679/", { + "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406", + }), + ("https://boards.fireden.net/sci/thread/11264294/", { + "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43", + }), + ("https://archive.nyafuu.org/c/thread/2849220/", { + "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", + }), + ("https://rbt.asia/g/thread/61487650/", { + "url": "61896d9d9a2edb556b619000a308a984307b6d30", + }), + ("https://archive.rebeccablacktech.com/g/thread/61487650/", { + "url": "61896d9d9a2edb556b619000a308a984307b6d30", + }), + ("https://thebarchive.com/b/thread/739772332/", { + "url": "e8b18001307d130d67db31740ce57c8561b5d80c", + }), + ) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board, self.thread = match.groups() + self.board = match.group(match.lastindex-1) + self.thread = match.group(match.lastindex) self.data = None def metadata(self): @@ -78,23 +152,34 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): return 
self.data["op"] def posts(self): + op = (self.data["op"],) posts = self.data.get("posts") if posts: posts = list(posts.values()) posts.sort(key=lambda p: p["timestamp"]) - else: - posts = () - return itertools.chain((self.data["op"],), posts) + return itertools.chain(op, posts) + return op class FoolfuukaBoardExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka based boards/archives""" subcategory = "board" - pattern_fmt = r"/([^/?#]+)/\d*$" + pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$" + test = ( + ("https://archive.4plebs.org/tg/"), + ("https://archived.moe/gd/"), + ("https://archiveofsins.com/h/"), + ("https://arch.b4k.co/meta/"), + ("https://desuarchive.org/a/"), + ("https://boards.fireden.net/sci/"), + ("https://archive.nyafuu.org/c/"), + ("https://rbt.asia/g/"), + ("https://thebarchive.com/b/"), + ) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(1) + self.board = match.group(match.lastindex) def items(self): index_base = "{}/_/api/chan/index/?board={}&page=".format( @@ -113,7 +198,7 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): for num, thread in threads.items(): thread["url"] = thread_base + format(num) - thread["_extractor"] = self.childclass + thread["_extractor"] = FoolfuukaThreadExtractor yield Message.Queue, thread["url"], thread @@ -121,15 +206,24 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): """Base extractor for search results on FoolFuuka based boards/archives""" subcategory = "search" directory_fmt = ("{category}", "search", "{search}") - pattern_fmt = r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" + pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" request_interval = 1.0 + test = ( + ("https://archive.4plebs.org/_/search/text/test/"), + ("https://archived.moe/_/search/text/test/"), + ("https://archiveofsins.com/_/search/text/test/"), + ("https://arch.b4k.co/_/search/text/test/"), + ("https://desuarchive.org/_/search/text/test/"), + 
("https://boards.fireden.net/_/search/text/test/"), + ("https://archive.nyafuu.org/_/search/text/test/"), + ("https://rbt.asia/_/search/text/test/"), + ("https://thebarchive.com/_/search/text/test/"), + ) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - board, search = match.groups() - self.params = params = {} - args = search.split("/") + args = match.group(match.lastindex).split("/") key = None for arg in args: @@ -138,6 +232,8 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): key = None else: key = arg + + board = match.group(match.lastindex-1) if board != "_": params["boards"] = board @@ -170,105 +266,3 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): if len(posts) <= 3: return params["page"] += 1 - - -EXTRACTORS = { - "4plebs": { - "name": "_4plebs", - "root": "https://archive.4plebs.org", - "pattern": r"(?:archive\.)?4plebs\.org", - "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", { - "url": "07452944164b602502b02b24521f8cee5c484d2a", - }), - "test-board": ("https://archive.4plebs.org/tg/",), - "test-search": ("https://archive.4plebs.org/_/search/text/test/",), - }, - "archivedmoe": { - "root": "https://archived.moe", - "test-thread": ( - ("https://archived.moe/gd/thread/309639/", { - "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8", - "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", - }), - ("https://archived.moe/a/thread/159767162/", { - "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87", - }), - ), - "test-board": ("https://archived.moe/gd/",), - "test-search": ("https://archived.moe/_/search/text/test/",), - }, - "archiveofsins": { - "root": "https://archiveofsins.com", - "pattern": r"(?:www\.)?archiveofsins\.com", - "test-thread": ("https://archiveofsins.com/h/thread/4668813/", { - "url": "f612d287087e10a228ef69517cf811539db9a102", - "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4", - }), - "test-board": ("https://archiveofsins.com/h/",), - "test-search": 
("https://archiveofsins.com/_/search/text/test/",), - }, - "b4k": { - "root": "https://arch.b4k.co", - "extra": {"external": "direct"}, - "test-thread": ("https://arch.b4k.co/meta/thread/196/", { - "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a", - }), - "test-board": ("https://arch.b4k.co/meta/",), - "test-search": ("https://arch.b4k.co/_/search/text/test/",), - }, - "desuarchive": { - "root": "https://desuarchive.org", - "test-thread": ("https://desuarchive.org/a/thread/159542679/", { - "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406", - }), - "test-board": ("https://desuarchive.org/a/",), - "test-search": ("https://desuarchive.org/_/search/text/test/",), - }, - "fireden": { - "root": "https://boards.fireden.net", - "test-thread": ("https://boards.fireden.net/sci/thread/11264294/", { - "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43", - }), - "test-board": ("https://boards.fireden.net/sci/",), - "test-search": ("https://boards.fireden.net/_/search/text/test/",), - }, - "nyafuu": { - "root": "https://archive.nyafuu.org", - "pattern": r"(?:archive\.)?nyafuu\.org", - "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", { - "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", - }), - "test-board": ("https://archive.nyafuu.org/c/",), - "test-search": ("https://archive.nyafuu.org/_/search/text/test/",), - }, - "rbt": { - "root": "https://rbt.asia", - "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", - "test-thread": ( - ("https://rbt.asia/g/thread/61487650/", { - "url": "61896d9d9a2edb556b619000a308a984307b6d30", - }), - ("https://archive.rebeccablacktech.com/g/thread/61487650/", { - "url": "61896d9d9a2edb556b619000a308a984307b6d30", - }), - ), - "test-board": ("https://rbt.asia/g/",), - "test-search": ("https://rbt.asia/_/search/text/test/",), - }, - "thebarchive": { - "root": "https://thebarchive.com", - "pattern": r"thebarchive\.com", - "test-thread": ("https://thebarchive.com/b/thread/739772332/", { - "url": 
"e8b18001307d130d67db31740ce57c8561b5d80c", - }), - "test-board": ("https://thebarchive.com/b/",), - "test-search": ("https://thebarchive.com/_/search/text/test/",), - }, - "_ckey": "childclass", -} - -generate_extractors(EXTRACTORS, globals(), ( - FoolfuukaThreadExtractor, - FoolfuukaBoardExtractor, - FoolfuukaSearchExtractor, -))