[foolfuuka] add support for more sites (#18)

- https://arch.b4k.co
- https://archive.whatisthisimnotgoodwithcomputers.com
- https://archive.yeet.net

Notes:
- The name "whatisthisimnotgoodwithcomputers" is way too long ...
- archive.yeet.net is out of date and also blocked by 4chan servers
  - newest threads are 2 weeks old
  - using "https://archive.yeet.net" as Referer header results in "403 Forbidden" when accessing 4chan
2024-11-22 10:42:34 +01:00 · 2017-09-16 21:11:44 +02:00 · 2017-09-16 21:11:44 +02:00 · cebf800a7f
commit cebf800a7f
parent 84d4450410
7 changed files with 89 additions and 5 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@ -7,6 +7,7 @@ Site                 URL                                 Capabilities
 4chan                https://www.4chan.org/              Threads
 4plebs               https://archive.4plebs.org/         Threads
 8chan                https://8ch.net/                    Threads
 arch.b4k.co          https://arch.b4k.co/                Threads
 Archive of Sins      https://archiveofsins.com/          Threads
 Archived.Moe         https://archived.moe/               Threads
 Batoto               https://bato.to/                    Chapters, Manga                                    Optional
@ -73,6 +74,7 @@ Twitter              https://twitter.com/                Tweets
 Warosu               https://warosu.org/                 Threads
 World Three          http://www.slide.world-three.org/   Chapters, Manga
 Yandere              https://yande.re/                   Pools, Popular Images, Posts, Tag-Searches
 YEET Archive         https://archive.yeet.net/           Threads
 Acidimg              https://acidimg.cc/                 individual Images
 Chronos              http://chronos.to/                  individual Images
 Coreimg              http://coreimg.net/                 individual Images
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -18,6 +18,7 @@ modules = [
    "8chan",
    "archivedmoe",
    "archiveofsins",
    "b4k",
    "batoto",
    "danbooru",
    "desuarchive",
@ -78,8 +79,10 @@ modules = [
    "tumblr",
    "twitter",
    "warosu",
    "whatisthisimnotgoodwithcomputers",
    "worldthree",
    "yandere",
    "yeet",
    "imagehosts",
    "directlink",
    "recursive",
--- a/gallery_dl/extractor/b4k.py
+++ b/gallery_dl/extractor/b4k.py
@ -0,0 +1,24 @@
 # -*- coding: utf-8 -*-
 # Copyright 2017 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extract images from https://arch.b4k.co/"""
 from . import chan
 class BfourkThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Extractor for images from threads on arch.b4k.co"""
    # Site identifier for this extractor.
    category = "b4k"
    # Base URL of the archive; the parent class uses it as the Referer
    # header and as the prefix for media URLs that start with "/".
    root = "https://arch.b4k.co"
    # Two capture groups, unpacked by the parent __init__ into
    # (board, thread); the URL scheme is optional.
    pattern = [r"(?:https?://)?arch\.b4k\.co/([^/]+)/thread/(\d+)"]
    # Sample thread URL with an expected "url" value -- presumably a
    # checksum over the extracted URLs (verify against the test runner).
    test = [("http://arch.b4k.co/meta/thread/196/", {
        "url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
    })]
    def remote(self, media):
        # Override of the parent hook: use the 'remote_media_link' value
        # as the file URL as-is, without requesting it first.
        return media["remote_media_link"]
--- a/gallery_dl/extractor/chan.py
+++ b/gallery_dl/extractor/chan.py
@ -70,12 +70,14 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
                     "{thread_num} - {title}"]
    filename_fmt = "{media[media]}"
    root = ""
    referer = True
    def __init__(self, match):
        SharedConfigExtractor.__init__(self)
        self.board, self.thread = match.groups()
        self.session.headers["User-Agent"] = "Mozilla 5.0"
-        self.session.headers["Referer"] = self.root
+        if self.referer:
            self.session.headers["Referer"] = self.root
    def items(self):
        op = True
@ -91,9 +93,9 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
            url = media["media_link"]
            if not url and "remote_media_link" in media:
-                needle = '<meta http-equiv="Refresh" content="0; url='
+                url = self.remote(media)
-                page = self.request(media["remote_media_link"]).text
+            if url.startswith("/"):
-                url = text.extract(page, needle, '"')[0]
+                url = self.root + url
            post["extension"] = url.rpartition(".")[2]
            yield Message.Url, url, post
@ -104,7 +106,12 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
        data = self.request(url, params=params).json()[self.thread]
        # sort post-objects by their key
-        posts = sorted(data["posts"].items(), key=operator.itemgetter(0))
+        posts = sorted(data.get("posts", {}).items())
        posts = map(operator.itemgetter(1), posts)
        return itertools.chain((data["op"],), posts)
    def remote(self, media):
        """Return the URL a 'remote_media_link' page redirects to

        Requests the page behind media["remote_media_link"] and pulls the
        target out of its meta-refresh tag: everything between the marker
        below and the next double quote. The result is whatever
        text.extract() yields as its first element.
        """
        marker = '<meta http-equiv="Refresh" content="0; url='
        response = self.request(media["remote_media_link"])
        extracted = text.extract(response.text, marker, '"')
        return extracted[0]
--- a/gallery_dl/extractor/whatisthisimnotgoodwithcomputers.py
+++ b/gallery_dl/extractor/whatisthisimnotgoodwithcomputers.py
@ -0,0 +1,23 @@
 # -*- coding: utf-8 -*-
 # Copyright 2017 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extract images from https://archive.whatisthisimnotgoodwithcomputers.com"""
 from . import chan
 class WitingwcThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Extractor for archive.whatisthisimnotgoodwithcomputers.com"""
    # Site identifier for this extractor (same as the module name).
    category = "whatisthisimnotgoodwithcomputers"
    # Base URL of the archive; the parent class uses it as the Referer
    # header and as the prefix for media URLs that start with "/".
    root = "https://archive.whatisthisimnotgoodwithcomputers.com"
    # Two capture groups, unpacked by the parent __init__ into
    # (board, thread); the literal is split only to limit line length.
    pattern = [r"(?:https?://)?archive\.whatisthisimnotgoodwithcomputers\.com/"
               r"([^/]+)/thread/(\d+)"]
    # Sample thread URL with an expected "url" value -- presumably a
    # checksum over the extracted URLs (verify against the test runner).
    test = [(("https://archive.whatisthisimnotgoodwithcomputers.com/"
              "ref/thread/1094/"), {
        "url": "cf8f6d4b4950767d2131de308ebc96eec05b04f6",
    })]
--- a/gallery_dl/extractor/yeet.py
+++ b/gallery_dl/extractor/yeet.py
@ -0,0 +1,22 @@
 # -*- coding: utf-8 -*-
 # Copyright 2017 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extract images from https://archive.yeet.net/"""
 from . import chan
 class YeetThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Extractor for images from threads on archive.yeet.net"""
    # Site identifier for this extractor.
    category = "yeet"
    # Base URL of the archive; the parent class uses it as the prefix
    # for media URLs that start with "/".
    root = "https://archive.yeet.net"
    # Two capture groups, unpacked by the parent __init__ into
    # (board, thread); the URL scheme is optional.
    pattern = [r"(?:https?://)?archive\.yeet\.net/([^/]+)/thread/(\d+)"]
    # Sample thread URL with an expected "url" value -- presumably a
    # checksum over the extracted URLs (verify against the test runner).
    test = [("https://archive.yeet.net/yeet/thread/359/", {
        "url": "ced64a1aadaafc4f359ab89d9f801050731803f1",
    })]
    # Do not send a Referer header: the parent __init__ sets it only when
    # this flag is truthy, and sending "https://archive.yeet.net" as the
    # Referer gets "403 Forbidden" from 4chan.
    referer = False
--- a/scripts/build_supportedsites.py
+++ b/scripts/build_supportedsites.py
@ -12,6 +12,7 @@ CATEGORY_MAP = {
    "2chan"          : "Futaba Channel",
    "archivedmoe"    : "Archived.Moe",
    "archiveofsins"  : "Archive of Sins",
    "b4k"            : "arch.b4k.co",
    "deviantart"     : "DeviantArt",
    "dokireader"     : "Doki Reader",
    "dynastyscans"   : "Dynasty Reader",
@ -54,6 +55,7 @@ CATEGORY_MAP = {
    "spectrumnexus"  : "Spectrum Nexus",
    "thebarchive"    : "The /b/ Archive",
    "worldthree"     : "World Three",
    "yeet"           : "YEET Archive",
    "yomanga"        : "YoManga",
    "yonkouprod"     : "Yonkou Productions",
 }
@ -85,6 +87,7 @@ AUTH_MAP = {
 IGNORE_LIST = (
    "oauth",
    "whatisthisimnotgoodwithcomputers",
 )