From c8e5b2e89eaffec912b37a56cfa42482cc2105b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 29 Jun 2015 23:09:35 +0200 Subject: [PATCH 01/15] base class for futaba-chan boards with api --- gallery_dl/extractor/chan.py | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 gallery_dl/extractor/chan.py diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py new file mode 100644 index 00000000..cb336774 --- /dev/null +++ b/gallery_dl/extractor/chan.py @@ -0,0 +1,47 @@ + +# -*- coding: utf-8 -*- + +# Copyright 2015 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Base classes for extractors for different Futaba Channel boards""" + +from .common import SequentialExtractor, Message + +class ChanExtractor(SequentialExtractor): + + api_url = "" + file_url = "" + + def __init__(self, config, category, board, thread): + SequentialExtractor.__init__(self, config) + self.metadata = { + "category": category, + "board": board, + "thread": thread, + } + + def items(self): + yield Message.Version, 1 + posts = self.request(self.api_url.format(**self.metadata)).json()["posts"] + self.metadata["title"] = self.get_thread_title(posts[0]) + yield Message.Directory, self.metadata + for post in posts: + if "filename" not in post: + continue + post.update(self.metadata) + yield Message.Url, self.file_url.format(**post), post + + @staticmethod + def get_thread_title(post): + """Return thread title from first post""" + if "sub" in post: + return post["sub"] + com = post["com"] + pos = com.find("
") + if pos == -1: + return com + return com[:min(pos, 50)] From c9ef181b3cd9890b8697a382c75270d8b87006a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 29 Jun 2015 23:14:35 +0200 Subject: [PATCH 02/15] [4chan] use api --- gallery_dl/extractor/4chan.py | 58 ++++++----------------------------- 1 file changed, 9 insertions(+), 49 deletions(-) diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index 7d6c826b..74d1c867 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -8,65 +8,25 @@ """Extract image- and video-urls from threads on https://www.4chan.org/""" -from .common import SequentialExtractor, Message -from urllib.parse import unquote -import re +from .chan import ChanExtractor info = { "category": "4chan", "extractor": "FourChanExtractor", "directory": ["{category}", "{board}-{thread-id}"], - "filename": "{timestamp}-{name}", + "filename": "{time}-{filename}{ext}", "pattern": [ r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*", ], } -class FourChanExtractor(SequentialExtractor): +class FourChanExtractor(ChanExtractor): - url_fmt = "https://boards.4chan.org/{0}/res/{1}.html" - regex = ( - r'(?P[^<]+) ' - r'\((?P[^,]+), (?P\d+)x(?P\d+)\)' - ) + api_url = "https://a.4cdn.org/{board}/thread/{thread}.json" + file_url = "https://i.4cdn.org/{board}/{tim}{ext}" def __init__(self, match, config): - SequentialExtractor.__init__(self, config) - self.match = match - self.metadata = None - - def items(self): - yield Message.Version, 1 - - url = self.url_fmt.format(*self.match.groups()) - text = self.request(url).text - self.metadata = self.get_job_metadata(text) - - yield Message.Directory, self.metadata - for match in re.finditer(self.regex, text): - yield Message.Url, self.get_file_url(match), self.get_file_metadata(match) - - def get_job_metadata(self, text): - """Collect metadata for extractor-job""" - board, thread_id = self.match.groups() - title, _ = self.extract(text, '"description" content="', ' - "/') - return { - "category": info["category"], - "board": board, - "thread-id": thread_id, - "title": unquote(title), - } - - def get_file_metadata(self, match): - """Collect metadata for a downloadable file""" - data = self.metadata - data.update(match.groupdict(default="")) - data["name"] = unquote(data["orig_name"] or data["name"]) - return data - - @staticmethod - def get_file_url(match): - """Extract download-url from 'match'""" - return "https:" + match.group("url") + ChanExtractor.__init__( + self, config, info["category"], + match.group(1), match.group(2) + ) From 1998ec9b131dce38b652af06447ff667f01839bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 7 Sep 2015 13:48:16 +0200 Subject: [PATCH 03/15] [pixiv] update user-agent to newest version --- gallery_dl/extractor/pixiv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 0c51cd40..71674899 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -146,7 +146,7 @@ class PixivAPI(): self.session = session self.session.headers.update({ "Referer": "http://www.pixiv.net/", - "User-Agent": "PixivIOSApp/5.1.1", + "User-Agent": "PixivIOSApp/5.8.0", # "Authorization": "Bearer 8mMXXWT9iuwdJvsVIvQsFYDwuZpRCMePeyagSh30ZdU", }) From d8ef128e74a83e48bad3e8a35b7af8c9e13382a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 7 Sep 2015 13:49:47 +0200 Subject: [PATCH 04/15] [4chan] update default filename 
and directory --- gallery_dl/extractor/4chan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index 74d1c867..028ab7de 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -13,8 +13,8 @@ from .chan import ChanExtractor info = { "category": "4chan", "extractor": "FourChanExtractor", - "directory": ["{category}", "{board}-{thread-id}"], - "filename": "{time}-{filename}{ext}", + "directory": ["{category}", "{board}-{thread}"], + "filename": "{tim}-{filename}{ext}", "pattern": [ r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*", ], From d7e0d81bddcbef1b9697c8d221bbec805c77ab8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 7 Sep 2015 16:32:20 +0200 Subject: [PATCH 05/15] [8chan] use api --- gallery_dl/extractor/8chan.py | 62 +++++++---------------------------- gallery_dl/extractor/chan.py | 13 ++++---- 2 files changed, 18 insertions(+), 57 deletions(-) diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index d56e5e6f..559951fa 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -8,65 +8,25 @@ """Extract image- and video-urls from threads on https://8ch.net/""" -from .common import SequentialExtractor, Message -from urllib.parse import unquote -import re +from .chan import ChanExtractor info = { "category": "8chan", "extractor": "InfinityChanExtractor", - "directory": ["{category}", "{board}-{thread-id}"], - "filename": "{timestamp}-{name}", + "directory": ["{category}", "{board}-{thread}"], + "filename": "{tim}-{filename}{ext}", "pattern": [ - r"(?:https?://)?(?:www\.)?(?:8chan\.co|8ch\.net)/([^/]+/res/\d+).*", + r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*", ], } -class InfinityChanExtractor(SequentialExtractor): +class InfinityChanExtractor(ChanExtractor): - url_base = "https://8ch.net" - url_fmt = url_base + "/{board}/res/{thread-id}.html" - regex = ( - r'>File: ([^<]+)\.[^<]+<.*?' 
- r'([^<]+)<' - ) + api_url = "https://8ch.net/{board}/res/{thread}.json" + file_url = "https://media.8ch.net/{board}/src/{tim}{ext}" def __init__(self, match, config): - SequentialExtractor.__init__(self, config) - self.match = match - - def items(self): - yield Message.Version, 1 - - metadata = self.get_job_metadata() - yield Message.Directory, metadata - - url = self.url_fmt.format(**metadata) - text = self.request(url).text - for match in re.finditer(self.regex, text): - yield Message.Url, self.get_file_url(match), self.get_file_metadata(match) - - def get_job_metadata(self): - """Collect metadata for extractor-job""" - board, _, thread_id = self.match.group(1).split("/") - return { - "category": info["category"], - "board": board, - "thread-id": thread_id, - } - - @staticmethod - def get_file_metadata(match): - """Collect metadata for a downloadable file""" - return { - "timestamp": match.group(2), - "name": unquote(match.group(4) or match.group(5)), - } - - def get_file_url(self, match): - """Extract download-url from 'match'""" - url = match.group(1) - if url.startswith("/"): - url = self.url_base + url - return url - + ChanExtractor.__init__( + self, config, info["category"], + match.group(1), match.group(2) + ) diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py index cb336774..2f943068 100644 --- a/gallery_dl/extractor/chan.py +++ b/gallery_dl/extractor/chan.py @@ -1,4 +1,3 @@ - # -*- coding: utf-8 -*- # Copyright 2015 Mike Fährmann @@ -10,6 +9,7 @@ """Base classes for extractors for different Futaba Channel boards""" from .common import SequentialExtractor, Message +import re class ChanExtractor(SequentialExtractor): @@ -34,14 +34,15 @@ class ChanExtractor(SequentialExtractor): continue post.update(self.metadata) yield Message.Url, self.file_url.format(**post), post + if "extra_files" in post: + for file in post["extra_files"]: + post.update(file) + yield Message.Url, self.file_url.format(**post), post @staticmethod def get_thread_title(post): """Return thread title from first post""" if "sub" in post: return post["sub"] - com = post["com"] - pos = com.find("
") - if pos == -1: - return com - return com[:min(pos, 50)] + com = re.sub("<[^>]+?>", "", post["com"]) + return " ".join(com.split())[:50] From bc22f2bd3ab3d1f0ec55ca9b3993e84701716f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Oct 2015 14:55:55 +0200 Subject: [PATCH 06/15] update .gitignore --- .gitignore | 55 +++++++++++++++++++++++- gallery_dl.egg-info/PKG-INFO | 23 ---------- gallery_dl.egg-info/SOURCES.txt | 35 --------------- gallery_dl.egg-info/dependency_links.txt | 1 - gallery_dl.egg-info/entry_points.txt | 3 -- gallery_dl.egg-info/requires.txt | 1 - gallery_dl.egg-info/top_level.txt | 1 - 7 files changed, 54 insertions(+), 65 deletions(-) delete mode 100644 gallery_dl.egg-info/PKG-INFO delete mode 100644 gallery_dl.egg-info/SOURCES.txt delete mode 100644 gallery_dl.egg-info/dependency_links.txt delete mode 100644 gallery_dl.egg-info/entry_points.txt delete mode 100644 gallery_dl.egg-info/requires.txt delete mode 100644 gallery_dl.egg-info/top_level.txt diff --git a/.gitignore b/.gitignore index 12c84c56..ba746605 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,57 @@ +# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ build/ -dist/ \ No newline at end of file +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO deleted file mode 100644 index 74452230..00000000 --- a/gallery_dl.egg-info/PKG-INFO +++ /dev/null @@ -1,23 +0,0 @@ -Metadata-Version: 1.1 -Name: gallery-dl -Version: 0.2 -Summary: gallery- and image downloader -Home-page: https://github.com/mikf/gallery-dl -Author: Mike Fährmann -Author-email: mike_faehrmann@web.de -License: GPLv2 -Description: download image galleries from several image hosting platforms -Platform: UNKNOWN -Classifier: Development Status :: 3 - Alpha -Classifier: Environment :: Console -Classifier: Intended Audience :: End Users/Desktop -Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2) -Classifier: Operating System :: POSIX -Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.2 -Classifier: Programming Language :: Python :: 3.3 -Classifier: Programming Language :: Python :: 3.4 -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search -Classifier: Topic :: Multimedia -Classifier: Topic :: Multimedia :: Graphics diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt deleted file mode 100644 index 05ff4da8..00000000 --- a/gallery_dl.egg-info/SOURCES.txt +++ /dev/null @@ -1,35 +0,0 @@ -setup.py -bin/gallery-dl -gallery_dl/__init__.py -gallery_dl/download.py -gallery_dl.egg-info/PKG-INFO -gallery_dl.egg-info/SOURCES.txt -gallery_dl.egg-info/dependency_links.txt -gallery_dl.egg-info/entry_points.txt -gallery_dl.egg-info/requires.txt 
-gallery_dl.egg-info/top_level.txt -gallery_dl/downloader/__init__.py -gallery_dl/downloader/common.py -gallery_dl/downloader/http.py -gallery_dl/downloader/https.py -gallery_dl/downloader/text.py -gallery_dl/extractor/3dbooru.py -gallery_dl/extractor/4chan.py -gallery_dl/extractor/8chan.py -gallery_dl/extractor/__init__.py -gallery_dl/extractor/batoto.py -gallery_dl/extractor/booru.py -gallery_dl/extractor/common.py -gallery_dl/extractor/danbooru.py -gallery_dl/extractor/e621.py -gallery_dl/extractor/exhentai.py -gallery_dl/extractor/gelbooru.py -gallery_dl/extractor/imagebam.py -gallery_dl/extractor/imgbox.py -gallery_dl/extractor/imgchili.py -gallery_dl/extractor/mangareader.py -gallery_dl/extractor/nijie.py -gallery_dl/extractor/pixiv.py -gallery_dl/extractor/redhawkscans.py -gallery_dl/extractor/sankaku.py -gallery_dl/extractor/yandere.py \ No newline at end of file diff --git a/gallery_dl.egg-info/dependency_links.txt b/gallery_dl.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/gallery_dl.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/gallery_dl.egg-info/entry_points.txt b/gallery_dl.egg-info/entry_points.txt deleted file mode 100644 index 53cf5106..00000000 --- a/gallery_dl.egg-info/entry_points.txt +++ /dev/null @@ -1,3 +0,0 @@ -[console_scripts] -gallery-dl = gallery_dl:main - diff --git a/gallery_dl.egg-info/requires.txt b/gallery_dl.egg-info/requires.txt deleted file mode 100644 index d48cd089..00000000 --- a/gallery_dl.egg-info/requires.txt +++ /dev/null @@ -1 +0,0 @@ -requests >= 2.0 diff --git a/gallery_dl.egg-info/top_level.txt b/gallery_dl.egg-info/top_level.txt deleted file mode 100644 index 9e5039cb..00000000 --- a/gallery_dl.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -gallery_dl From c5801c9770d1da7a83e3b0a2fc527dbf1b3b06f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 3 Oct 2015 12:53:45 +0200 Subject: [PATCH 07/15] combine text related functions in new module --- gallery_dl/text.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 gallery_dl/text.py diff --git a/gallery_dl/text.py b/gallery_dl/text.py new file mode 100644 index 00000000..47fd7258 --- /dev/null +++ b/gallery_dl/text.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Collection of functions that work in strings/text""" + +import re +import html.parser +import urllib.parse +import platform + +def remove_html(text): + """Remove html-tags from a string""" + return " ".join(re.sub("<[^>]+?>", " ", text).split()) + +def filename_from_url(url): + """Extract the last part of an url to use as a filename""" + try: + path = urllib.parse.urlparse(url).path + pos = path.rindex("/") + return path[pos+1:] + except ValueError: + return url + +def clean_path_windows(path): + """Remove illegal characters from a path-segment (Windows)""" + return re.sub(r'[<>:"\\/|?*]', "_", path) + +def clean_path_posix(path): + """Remove illegal characters from a path-segment (Posix)""" + return path.replace("/", "_") + +def extract(txt, begin, end, pos=0): + try: + first = txt.index(begin, pos) + len(begin) + last = txt.index(end, first) + return txt[first:last], last+len(end) + except ValueError: + return None, pos + +def extract_all(txt, begin, end, pos=0): + try: + first = txt.index(begin, pos) + last = txt.index(end, first + len(begin)) + len(end) + return txt[first:last], last + except ValueError: + return None, pos + +if platform.system() == "Windows": + clean_path = clean_path_windows +else: + clean_path = clean_path_posix + +unquote = urllib.parse.unquote + +unescape = html.parser.HTMLParser().unescape From 2962bf36f606d97ee0f3f9ca65227cf26c852ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 3 Oct 2015 14:51:13 +0200 Subject: [PATCH 08/15] add tests for text-module --- setup.py | 1 + test/__init__.py | 0 test/test_text.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 test/__init__.py create mode 100644 test/test_text.py diff --git a/setup.py b/setup.py index db9adead..fe2d134e 100644 --- a/setup.py +++ b/setup.py @@ -46,4 +46,5 @@ setup( "Topic :: Multimedia", "Topic :: Multimedia :: Graphics", ], + test_suite='test', ) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/test_text.py b/test/test_text.py new file mode 100644 index 00000000..91e0097e --- /dev/null +++ b/test/test_text.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import unittest +import gallery_dl.text as text + +class TestText(unittest.TestCase): + + def test_remove_html(self): + cases = ( + "Hello World.", + " Hello World. ", + "Hello
World.", + "
HelloWorld.
" + ) + result = "Hello World." + for case in cases: + self.assertEqual(text.remove_html(case), result) + + def test_filename_from_url(self): + cases = ( + "http://example.org/v2/filename.ext", + "http://example.org/v2/filename.ext?param=value#fragment", + "example.org/filename.ext", + "/filename.ext", + "filename.ext", + ) + result = "filename.ext" + for case in cases: + self.assertEqual(text.filename_from_url(case), result) + + def test_clean_path(self): + cases = { + "Hello World." : ("Hello World.", "Hello World."), + "Hello/World/.": ("Hello_World_.", "Hello_World_."), + r':|"World\*?': ( + '_Hello____World___', r':|"World\*?' + ), + } + for case, result in cases.items(): + self.assertEqual(text.clean_path_windows(case), result[0]) + self.assertEqual(text.clean_path_posix (case), result[1]) + +if __name__ == '__main__': + unittest.main() From 42b8e81a680628dc0a4b36fc9a329fb25a9e5010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 3 Oct 2015 15:43:02 +0200 Subject: [PATCH 09/15] rewrite extractors to use text-module --- gallery_dl/extractor/batoto.py | 26 ++++++++++++-------------- gallery_dl/extractor/booru.py | 12 +++++------- gallery_dl/extractor/chan.py | 4 ++-- gallery_dl/extractor/common.py | 24 ------------------------ gallery_dl/extractor/imagebam.py | 25 ++++++++++++------------- gallery_dl/extractor/imgbox.py | 17 +++++++++-------- gallery_dl/extractor/imgchili.py | 7 +++---- gallery_dl/extractor/mangareader.py | 12 +++++------- gallery_dl/extractor/nijie.py | 6 +++--- gallery_dl/extractor/pixiv.py | 14 +++++++------- gallery_dl/extractor/redhawkscans.py | 17 ++++++++--------- 11 files changed, 66 insertions(+), 98 deletions(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index ac363052..65bc7c3d 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -8,10 +8,8 @@ """Extract manga pages from http://bato.to/""" -from .common import AsynchronousExtractor -from .common import Message -from .common import filename_from_url, unescape -from urllib.parse import unquote +from .common import AsynchronousExtractor, Message +from .. import text import os.path import re @@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor): def get_page_metadata(self, page_url): """Collect next url and metadata for one manga-page""" page = self.request(page_url).text - _ , pos = self.extract(page, 'selected="selected"', '') - title, pos = self.extract(page, ': ', '<', pos) - _ , pos = self.extract(page, 'selected="selected"', '', pos) - trans, pos = self.extract(page, '>', '<', pos) - _ , pos = self.extract(page, '
', '<', pos) + _ , pos = text.extract(page, '
(.+) - (?:vol (\d+) )?" r"ch (\d+)[^ ]+ Page (\d+) | Batoto!", @@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor): r"(.+) - ([^ ]+)", trans ) - filename = unquote(filename_from_url(image)) + filename = text.unquote(text.filename_from_url(image)) name, ext = os.path.splitext(filename) return url, { "category": info["category"], "chapter-id": self.chapter_id, - "manga": unescape(mmatch.group(1)), + "manga": text.unescape(mmatch.group(1)), "volume": mmatch.group(2) or "", "chapter": mmatch.group(3), "page": mmatch.group(4), "group": tmatch.group(1), "language": tmatch.group(2), - "title": unescape(title), + "title": text.unescape(title), "image-url": image, "name": name, "extension": ext[1:], diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 88600397..f72bc789 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -8,15 +8,13 @@ """Base classes for extractors for danbooru and co""" -from .common import SequentialExtractor -from .common import Message -from .common import filename_from_url +from .common import SequentialExtractor, Message +from .. import text import xml.etree.ElementTree as ET import json import os.path import urllib.parse - class BooruExtractor(SequentialExtractor): api_url = "" @@ -24,7 +22,7 @@ class BooruExtractor(SequentialExtractor): def __init__(self, match, config, info): SequentialExtractor.__init__(self, config) self.info = info - self.tags = urllib.parse.unquote(match.group(1)) + self.tags = text.unquote(match.group(1)) self.page = "page" self.params = {"tags": self.tags} self.headers = {} @@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor): def get_file_metadata(self, data): """Collect metadata for a downloadable file""" data["category"] = self.info["category"] - data["name"] = urllib.parse.unquote( - filename_from_url(self.get_file_url(data)) + data["name"] = text.unquote( + text.filename_from_url(self.get_file_url(data)) ) data["extension"] = os.path.splitext(data["name"])[1][1:] return data diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py index 2f943068..2d2b6fb4 100644 --- a/gallery_dl/extractor/chan.py +++ b/gallery_dl/extractor/chan.py @@ -9,6 +9,7 @@ """Base classes for extractors for different Futaba Channel boards""" from .common import SequentialExtractor, Message +from .. 
import text import re class ChanExtractor(SequentialExtractor): @@ -44,5 +45,4 @@ class ChanExtractor(SequentialExtractor): """Return thread title from first post""" if "sub" in post: return post["sub"] - com = re.sub("<[^>]+?>", "", post["com"]) - return " ".join(com.split())[:50] + return text.remove_html(post["com"])[:50] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index cb8e91ca..b364d870 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -44,24 +44,6 @@ class Extractor(): "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0" ) - @staticmethod - def extract(txt, begin, end, pos=0): - try: - first = txt.index(begin, pos) + len(begin) - last = txt.index(end, first) - return txt[first:last], last+len(end) - except ValueError: - return None, pos - - @staticmethod - def extract_all(txt, begin, end, pos=0): - try: - first = txt.index(begin, pos) - last = txt.index(end, first + len(begin)) + len(end) - return txt[first:last], last - except ValueError: - return None, pos - class SequentialExtractor(Extractor): @@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs): # everything ok -- proceed to download return r - -def filename_from_url(url): - pos = url.rfind("/") - return url[pos+1:] - -unescape = html.parser.HTMLParser().unescape diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index f8886a7a..c89721f2 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -8,9 +8,8 @@ """Extract images from galleries at http://www.imagebam.com/""" -from .common import AsynchronousExtractor -from .common import Message -from .common import filename_from_url +from .common import AsynchronousExtractor, Message +from .. import text info = { "category": "imagebam", @@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor): done = False while not done: # get current page - text = self.request(self.url_base + next_url).text + page = self.request(self.url_base + next_url).text # get url for next page - next_url, pos = self.extract(text, "next image" we are done - if not text.startswith(">next image", pos): + if not page.startswith(">next image", pos): done = True # get image url - img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos) + img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos) yield Message.Url, img_url, self.get_file_metadata(img_url) def get_job_metadata(self): """Collect metadata for extractor-job""" gallery_key = self.match.group(2) - text = self.request(self.url_base + "/gallery/" + gallery_key).text - _ , pos = self.extract(text, " ", " <", pos) - count, pos = self.extract(text, "'>", " images", pos) - url , pos = self.extract(text, " ", " <", pos) + count, pos = text.extract(page, "'>", " images", pos) + url , pos = text.extract(page, "', page):
-            text = self.request(self.url_base + match.group(1)).text
-            yield Message.Url, self.get_file_url(text), self.get_file_metadata(text)
+            imgpage = self.request(self.url_base + match.group(1)).text
+            yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)
 
     def get_job_metadata(self, page):
           ', ' of ') - data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos) - data["name"] , pos = self.extract(text, ' title="', '"', pos) + data["num"] , pos = text.extract(page, '   ', ' of ') + data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos) + data["name"] , pos = text.extract(page, ' title="', '"', pos) return data - def get_file_url(self, text): + def get_file_url(self, page): """Extract download-url""" base = "http://i.imgbox.com/" - path, _ = self.extract(text, base, '"') + path, _ = text.extract(page, base, '"') return base + path diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py index 40932912..9e591e57 100644 --- a/gallery_dl/extractor/imgchili.py +++ b/gallery_dl/extractor/imgchili.py @@ -8,9 +8,8 @@ """Extract images from albums at http://imgchili.net/""" -from .common import SequentialExtractor -from .common import Message -from .common import filename_from_url +from .common import SequentialExtractor, Message +from .. import text import re info = { @@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor): def get_job_metadata(self, page): """Collect metadata for extractor-job""" - title = self.extract(page, "
<h1>", "</h1>")[0]
+        title = text.extract(page, "<h1>", "</h1>
")[0] return { "category": info["category"], "title": title, diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 62575308..60ed473a 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -8,10 +8,8 @@ """Extract manga pages from http://www.mangareader.net/""" -from .common import AsynchronousExtractor -from .common import Message -from .common import unescape, filename_from_url -from urllib.parse import unquote +from .common import AsynchronousExtractor, Message +from .. import text import os.path import re @@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor): def get_page_metadata(self, page_url): """Collect next url, image-url and metadata for one manga-page""" page = self.request(page_url).text - extr = self.extract + extr = text.extract width = None descr, pos = extr(page, '', '') - manga , pos = self.extract(page, 'title="', '"', pos) - chapter , pos = self.extract(page, '">', '', pos) - json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos) + _ , pos = text.extract(page, '

', '') + manga , pos = text.extract(page, 'title="', '"', pos) + chapter , pos = text.extract(page, '">', '', pos) + json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos) match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter) return { "category": info["category"], - "manga": unescape(manga), + "manga": text.unescape(manga), "chapter": match.group(2) or match.group(1), "chapter-minor": match.group(3) or "", "language": "English", - "title": unescape(match.group(4) or ""), + "title": text.unescape(match.group(4) or ""), }, json.loads(json_data) From 9986a5ffb50f3ffc5e9d91f6f8fe36fdf278d131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 3 Oct 2015 20:23:55 +0200 Subject: [PATCH 10/15] json-based config module --- gallery_dl/config.py | 90 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 gallery_dl/config.py diff --git a/gallery_dl/config.py b/gallery_dl/config.py new file mode 100644 index 00000000..549e40e9 --- /dev/null +++ b/gallery_dl/config.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Global configuration module""" + +import sys +import json +import os.path +import platform + +# -------------------------------------------------------------------- +# public interface + +def load(*files): + """Load JSON configuration files""" + configfiles = files or _default_configs + for conf in configfiles: + try: + path = os.path.expanduser(conf) + with open(path) as file: + confdict = json.load(file) + _config.update(confdict) + except FileNotFoundError: + continue + except json.decoder.JSONDecodeError as exception: + print("Error while loading '", path, "':", sep="", file=sys.stderr) + print(exception, file=sys.stderr) + +def clear(): + """Reset configuration to en empty state""" + globals()["_config"] = {} + +def get(key, default=None): + """Get the value of property 'key' or a default-value if it doenst exist""" + conf = _config + try: + for k in key.split("."): + conf = conf[k] + return conf + except (KeyError, AttributeError): + return default + +def interpolate(key, default=None): + """Interpolate the value of 'key'""" + conf = _config + keys = key.split(".") + try: + for k in keys: + default = conf.get(keys[-1], default) + conf = conf[k] + return conf + except (KeyError, AttributeError): + return default + +def set(key, value): + """Set the value of property 'key' for this session""" + conf = _config + keys = key.split(".") + for k in keys[:-1]: + try: + conf = conf[k] + except KeyError: + temp = {} + conf[k] = temp + conf = temp + conf[keys[-1]] = value + + +# -------------------------------------------------------------------- +# internals + +_config = {} + +if platform.system() == "Windows": + _default_configs = [ + r"~\.config\gallery-dl.conf", + r"~\.gallery-dl.conf", + ] +else: + _default_configs = [ + "/etc/gallery-dl.conf", + "~/.config/gallery/config.json", + "~/.config/gallery-dl.conf", + "~/.gallery-dl.conf", + ] From 7ac106096f8da32aa20bc2c4f18730d88345786a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 3 Oct 2015 20:24:28 +0200 Subject: [PATCH 11/15] add tests for config-module --- test/test_config.py | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 test/test_config.py diff 
--git a/test/test_config.py b/test/test_config.py new file mode 100644 index 00000000..3aaeb42c --- /dev/null +++ b/test/test_config.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import unittest +import gallery_dl.config as config +import os +import tempfile + +class TestConfig(unittest.TestCase): + + def setUp(self): + fd, self._configfile = tempfile.mkstemp() + with os.fdopen(fd, "w") as file: + file.write('{"a": "1", "b": {"c": "text"}}') + config.load(self._configfile) + + def tearDown(self): + config.clear() + os.remove(self._configfile) + + def test_get(self): + self.assertEqual(config.get("a"), "1") + self.assertEqual(config.get("b.c"), "text") + self.assertEqual(config.get("d"), None) + self.assertEqual(config.get("e.f.g", 123), 123) + + def test_set(self): + config.set("b.c", [1, 2, 3]) + config.set("e.f.g", 234) + self.assertEqual(config.get("b.c"), [1, 2, 3]) + self.assertEqual(config.get("e.f.g"), 234) + + def test_interpolate(self): + self.assertEqual(config.interpolate("a"), "1") + self.assertEqual(config.interpolate("b.a"), "1") + self.assertEqual(config.interpolate("b.c", "2"), "text") + self.assertEqual(config.interpolate("b.d", "2"), "2") + config.set("d", 123) + self.assertEqual(config.interpolate("b.d", "2"), 123) + self.assertEqual(config.interpolate("d.d", "2"), 123) + +if __name__ == '__main__': + unittest.main() From 2026223ed10d1461ca7327b7b8aaa347b72c5b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 5 Oct 2015 12:42:42 +0200 Subject: [PATCH 12/15] change argument format for config-calls --- gallery_dl/config.py | 10 ++++------ test/test_config.py | 30 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/gallery_dl/config.py b/gallery_dl/config.py index 549e40e9..9a02f307 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -35,20 +35,19 @@ def clear(): """Reset configuration to en empty state""" globals()["_config"] = {} -def get(key, default=None): +def get(keys, default=None): """Get the value of property 'key' or a default-value if it doenst exist""" conf = _config try: - for k in key.split("."): + for k in keys: conf = conf[k] return conf except (KeyError, AttributeError): return default -def interpolate(key, default=None): +def interpolate(keys, default=None): """Interpolate the value of 'key'""" conf = _config - keys = key.split(".") try: for k in keys: default = conf.get(keys[-1], default) @@ -57,10 +56,9 @@ def interpolate(key, default=None): except (KeyError, AttributeError): return default -def set(key, value): +def set(keys, value): """Set the value of property 'key' for this session""" conf = _config - keys = key.split(".") for k in keys[:-1]: try: conf = conf[k] diff --git a/test/test_config.py b/test/test_config.py index 3aaeb42c..f8017626 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -25,25 +25,25 @@ class TestConfig(unittest.TestCase): os.remove(self._configfile) def test_get(self): - self.assertEqual(config.get("a"), "1") - self.assertEqual(config.get("b.c"), "text") - self.assertEqual(config.get("d"), None) - self.assertEqual(config.get("e.f.g", 123), 123) + self.assertEqual(config.get(["a"]), "1") + self.assertEqual(config.get(["b", "c"]), "text") + self.assertEqual(config.get(["d"]), None) + 
self.assertEqual(config.get(["e", "f", "g"], 123), 123) def test_set(self): - config.set("b.c", [1, 2, 3]) - config.set("e.f.g", 234) - self.assertEqual(config.get("b.c"), [1, 2, 3]) - self.assertEqual(config.get("e.f.g"), 234) + config.set(["b", "c"], [1, 2, 3]) + config.set(["e", "f", "g"], value=234) + self.assertEqual(config.get(["b", "c"]), [1, 2, 3]) + self.assertEqual(config.get(["e", "f", "g"]), 234) def test_interpolate(self): - self.assertEqual(config.interpolate("a"), "1") - self.assertEqual(config.interpolate("b.a"), "1") - self.assertEqual(config.interpolate("b.c", "2"), "text") - self.assertEqual(config.interpolate("b.d", "2"), "2") - config.set("d", 123) - self.assertEqual(config.interpolate("b.d", "2"), 123) - self.assertEqual(config.interpolate("d.d", "2"), 123) + self.assertEqual(config.interpolate(["a"]), "1") + self.assertEqual(config.interpolate(["b", "a"]), "1") + self.assertEqual(config.interpolate(["b", "c"], "2"), "text") + self.assertEqual(config.interpolate(["b", "d"], "2"), "2") + config.set(["d"], 123) + self.assertEqual(config.interpolate(["b", "d"], "2"), 123) + self.assertEqual(config.interpolate(["d", "d"], "2"), 123) if __name__ == '__main__': unittest.main() From 608d3193a9f8137e4d5f116251dde23aa8e358c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 5 Oct 2015 13:26:38 +0200 Subject: [PATCH 13/15] use new config-module in downloader --- gallery_dl/__init__.py | 16 +++------------- gallery_dl/download.py | 39 +++++++++++++++++---------------------- 2 files changed, 20 insertions(+), 35 deletions(-) diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index aed11666..b0cebaed 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -17,9 +17,7 @@ __email__ = "mike_faehrmann@web.de" import os import sys import argparse -import configparser - -from .download import DownloadManager +from . import config, download def parse_cmdline_options(): parser = argparse.ArgumentParser( @@ -41,18 +39,10 @@ def parse_cmdline_options(): ) return parser.parse_args() -def parse_config_file(path): - config = configparser.ConfigParser( - interpolation=None, - ) - config.optionxform = lambda opt: opt - config.read(os.path.expanduser(path)) - return config - def main(): + config.load() opts = parse_cmdline_options() - conf = parse_config_file(opts.config) - dlmgr = DownloadManager(opts, conf) + dlmgr = download.DownloadManager(opts) try: for url in opts.urls: diff --git a/gallery_dl/download.py b/gallery_dl/download.py index 96ababa5..7fdfacfd 100644 --- a/gallery_dl/download.py +++ b/gallery_dl/download.py @@ -12,14 +12,14 @@ import re import importlib from .extractor.common import Message +from . 
import config class DownloadManager(): - def __init__(self, opts, config): + def __init__(self, opts): self.opts = opts - self.config = config self.modules = {} - self.extractors = ExtractorFinder(config) + self.extractors = ExtractorFinder() def add(self, url): job = DownloadJob(self, url) @@ -38,7 +38,7 @@ class DownloadManager(): if self.opts.dest: return self.opts.dest else: - return self.config.get("general", "destination", fallback="/tmp/") + return config.get(("base-directory",), default="/tmp/") class DownloadJob(): @@ -50,16 +50,14 @@ class DownloadJob(): return self.directory = mngr.get_base_directory() self.downloaders = {} - self.filename_fmt = mngr.config.get( - self.info["category"], "filename", - fallback=self.info["filename"] + self.filename_fmt = config.get( + ("extractor", self.info["category"], "filename"), + default=self.info["filename"] + ) + segments = config.get( + ("extractor", self.info["category"], "directory"), + default=self.info["directory"] ) - try: - segments = mngr.config.get( - self.info["category"], "directory" - ).split("/") - except Exception: - segments = self.info["directory"] self.directory_fmt = os.path.join(*segments) def run(self): @@ -144,26 +142,23 @@ class DownloadJob(): class ExtractorFinder(): - def __init__(self, config): - self.config = config - def get_for_url(self, url): """Get an extractor-instance suitable for 'url'""" name, match = self.find_pattern_match(url) if match: module = importlib.import_module(".extractor." + name, __package__) klass = getattr(module, module.info["extractor"]) - return klass(match, self.config), module.info + return klass(match, {}), module.info else: print("no suitable extractor found") return None, None def find_pattern_match(self, url): - """Find a pattern, that matches 'url', and return the (category,match) tuple""" - for category in self.config: - for key, value in self.config[category].items(): - if key.startswith("regex"): - match = re.match(value, url) + """Find a pattern that matches 'url' and return the (category,match) tuple""" + for category in config.get(("extractor",)): + patterns = config.get(("extractor", category, "pattern"), default=[]) + for pattern in patterns: + match = re.match(pattern, url) if match: return category, match for category, info in self.extractor_metadata(): From 3c13548f29502398b1cf785ecc44c3df57a696a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 5 Oct 2015 15:35:48 +0200 Subject: [PATCH 14/15] rewrite extractors to use config-module --- gallery_dl/download.py | 10 ++++------ gallery_dl/extractor/3dbooru.py | 4 ++-- gallery_dl/extractor/4chan.py | 4 ++-- gallery_dl/extractor/8chan.py | 4 ++-- gallery_dl/extractor/batoto.py | 4 ++-- gallery_dl/extractor/booru.py | 4 ++-- gallery_dl/extractor/chan.py | 5 ++--- gallery_dl/extractor/common.py | 8 ++++---- gallery_dl/extractor/danbooru.py | 4 ++-- gallery_dl/extractor/e621.py | 4 ++-- gallery_dl/extractor/gelbooru.py | 4 ++-- gallery_dl/extractor/imagebam.py | 4 ++-- gallery_dl/extractor/imgbox.py | 4 ++-- gallery_dl/extractor/imgchili.py | 4 ++-- gallery_dl/extractor/mangareader.py | 4 ++-- gallery_dl/extractor/nijie.py | 20 +++++++++++--------- gallery_dl/extractor/pixiv.py | 11 +++++------ gallery_dl/extractor/redhawkscans.py | 4 ++-- gallery_dl/extractor/yandere.py | 4 ++-- 19 files changed, 54 insertions(+), 56 deletions(-) diff --git a/gallery_dl/download.py b/gallery_dl/download.py index 7fdfacfd..f1ba96d6 100644 --- a/gallery_dl/download.py +++ b/gallery_dl/download.py @@ -112,13 +112,11 @@ 
class DownloadJob(): scheme = url[:pos] if pos != -1 else "http" if scheme == "https": scheme = "http" - downloader = self.downloaders.get(scheme) if downloader is None: module = self.mngr.get_downloader_module(scheme) downloader = module.Downloader() self.downloaders[scheme] = downloader - return downloader @staticmethod @@ -148,7 +146,7 @@ class ExtractorFinder(): if match: module = importlib.import_module(".extractor." + name, __package__) klass = getattr(module, module.info["extractor"]) - return klass(match, {}), module.info + return klass(match), module.info else: print("no suitable extractor found") return None, None @@ -158,9 +156,9 @@ class ExtractorFinder(): for category in config.get(("extractor",)): patterns = config.get(("extractor", category, "pattern"), default=[]) for pattern in patterns: - match = re.match(pattern, url) - if match: - return category, match + match = re.match(pattern, url) + if match: + return category, match for category, info in self.extractor_metadata(): for pattern in info["pattern"]: match = re.match(pattern, url) diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index a17b954c..665c1e01 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -22,8 +22,8 @@ info = { class ThreeDeeBooruExtractor(JSONBooruExtractor): - def __init__(self, match, config): - JSONBooruExtractor.__init__(self, match, config, info) + def __init__(self, match): + JSONBooruExtractor.__init__(self, match, info) self.api_url = "http://behoimi.org/post/index.json" self.headers = { "Referer": "http://behoimi.org/post/show/", diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py index 028ab7de..9aab90a2 100644 --- a/gallery_dl/extractor/4chan.py +++ b/gallery_dl/extractor/4chan.py @@ -25,8 +25,8 @@ class FourChanExtractor(ChanExtractor): api_url = "https://a.4cdn.org/{board}/thread/{thread}.json" file_url = "https://i.4cdn.org/{board}/{tim}{ext}" - def __init__(self, match, config): + def __init__(self, match): ChanExtractor.__init__( - self, config, info["category"], + self, info["category"], match.group(1), match.group(2) ) diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index 559951fa..43d34de5 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -25,8 +25,8 @@ class InfinityChanExtractor(ChanExtractor): api_url = "https://8ch.net/{board}/res/{thread}.json" file_url = "https://media.8ch.net/{board}/src/{tim}{ext}" - def __init__(self, match, config): + def __init__(self, match): ChanExtractor.__init__( - self, config, info["category"], + self, info["category"], match.group(1), match.group(2) ) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index 65bc7c3d..640df8ac 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -27,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor): url_base = "http://bato.to/read/_/" - def __init__(self, match, config): - AsynchronousExtractor.__init__(self, config) + def __init__(self, match): + AsynchronousExtractor.__init__(self) self.chapter_id = match.group(1) def items(self): diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index f72bc789..14629fd6 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -19,8 +19,8 @@ class BooruExtractor(SequentialExtractor): api_url = "" - def __init__(self, match, config, info): - SequentialExtractor.__init__(self, config) + def __init__(self, match, info): + 
SequentialExtractor.__init__(self) self.info = info self.tags = text.unquote(match.group(1)) self.page = "page" diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py index 2d2b6fb4..c6389314 100644 --- a/gallery_dl/extractor/chan.py +++ b/gallery_dl/extractor/chan.py @@ -10,15 +10,14 @@ from .common import SequentialExtractor, Message from .. import text -import re class ChanExtractor(SequentialExtractor): api_url = "" file_url = "" - def __init__(self, config, category, board, thread): - SequentialExtractor.__init__(self, config) + def __init__(self, category, board, thread): + SequentialExtractor.__init__(self) self.metadata = { "category": category, "board": board, diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index b364d870..4d5b96a9 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -12,7 +12,7 @@ import time import queue import requests import threading -import html.parser +from .. import config class Message(): @@ -47,15 +47,15 @@ class Extractor(): class SequentialExtractor(Extractor): - def __init__(self, _): + def __init__(self): Extractor.__init__(self) class AsynchronousExtractor(Extractor): - def __init__(self, config): + def __init__(self): Extractor.__init__(self) - queue_size = int(config.get("general", "queue-size", fallback=5)) + queue_size = int(config.get(("queue-size",), default=5)) self.__queue = queue.Queue(maxsize=queue_size) self.__thread = threading.Thread(target=self.async_items, daemon=True) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 3e94cd65..5024020f 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -22,6 +22,6 @@ info = { class DanbooruExtractor(JSONBooruExtractor): - def __init__(self, match, config): - JSONBooruExtractor.__init__(self, match, config, info) + def __init__(self, match): + JSONBooruExtractor.__init__(self, match, info) self.api_url = "https://danbooru.donmai.us/posts.json" diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 3851e447..af4971e8 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -23,6 +23,6 @@ info = { class E621Extractor(JSONBooruExtractor): - def __init__(self, match, config): - JSONBooruExtractor.__init__(self, match, config, info) + def __init__(self, match): + JSONBooruExtractor.__init__(self, match, info) self.api_url = "https://e621.net/post/index.json" diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index a95ed82e..87244904 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -22,8 +22,8 @@ info = { class GelbooruExtractor(XMLBooruExtractor): - def __init__(self, match, config): - XMLBooruExtractor.__init__(self, match, config, info) + def __init__(self, match): + XMLBooruExtractor.__init__(self, match, info) self.api_url = "http://gelbooru.com/" self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags} diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index c89721f2..809eaa1a 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -25,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor): url_base = "http://www.imagebam.com" - def __init__(self, match, config): - AsynchronousExtractor.__init__(self, config) + def __init__(self, match): + AsynchronousExtractor.__init__(self) self.match = match self.num = 0 self.metadata = {} diff --git 
a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index 3de51f27..f466c96a 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -26,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor): url_base = "http://imgbox.com" - def __init__(self, match, config): - AsynchronousExtractor.__init__(self, config) + def __init__(self, match): + AsynchronousExtractor.__init__(self) self.key = match.group(1) self.metadata = {} diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py index 9e591e57..8d164764 100644 --- a/gallery_dl/extractor/imgchili.py +++ b/gallery_dl/extractor/imgchili.py @@ -24,8 +24,8 @@ info = { class ImgchiliExtractor(SequentialExtractor): - def __init__(self, match, config): - SequentialExtractor.__init__(self, config) + def __init__(self, match): + SequentialExtractor.__init__(self) self.match = match self.num = 0 diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 60ed473a..57fd3efc 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -28,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor): url_base = "http://www.mangareader.net" - def __init__(self, match, config): - AsynchronousExtractor.__init__(self, config) + def __init__(self, match): + AsynchronousExtractor.__init__(self) self.part = match.group(1) def items(self): diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index afeefd60..7c309fbf 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -9,7 +9,7 @@ """Extract images from https://nijie.info/""" from .common import AsynchronousExtractor, Message -from ..text import filename_from_url +from .. import config, text import re info = { @@ -26,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor): popup_url = "https://nijie.info/view_popup.php?id=" - def __init__(self, match, config): - AsynchronousExtractor.__init__(self, config) + def __init__(self, match): + AsynchronousExtractor.__init__(self) self.artist_id = match.group(1) self.artist_url = ( "https://nijie.info/members_illust.php?id=" @@ -36,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor): self.session.headers["Referer"] = self.artist_url self.session.cookies["R18"] = "1" self.session.cookies["nijie_referer"] = "nijie.info" - self.session.cookies.update(config["nijie-cookies"]) + self.session.cookies.update( + config.get(("extractor", info["category"], "cookies")) + ) def items(self): data = self.get_job_metadata() @@ -56,19 +58,19 @@ class NijieExtractor(AsynchronousExtractor): def get_image_ids(self): """Collect all image-ids for a specific artist""" - text = self.request(self.artist_url).text + page = self.request(self.artist_url).text regex = r' Date: Mon, 5 Oct 2015 15:55:11 +0200 Subject: [PATCH 15/15] change example-config to json --- config | 18 ------------------ config.json | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 18 deletions(-) delete mode 100644 config create mode 100644 config.json diff --git a/config b/config deleted file mode 100644 index dec8b374..00000000 --- a/config +++ /dev/null @@ -1,18 +0,0 @@ -[pixiv] -username = XXXXX -password = XXXXX - -[exhentai-cookies] -ipb_member_id = XXXXX -ipb_pass_hash = XXXXX - -[nijie-cookies] -NIJIEIJIEID = XXXXX -nijie_email_hash = XXXXX -nijie_login_hash = XXXXX - -[danbooru] -regex0 = d(?:anbooru)?[.:-_](\w.+) - -[gelbooru] -regex0 = g(?:elbooru)?[.:-_](\w.+) diff --git a/config.json b/config.json new 
file mode 100644 index 00000000..deba0cef --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "base-directory": "/tmp/", + "extractor": + { + "pixiv": + { + "directory": ["{category}", "{artist-id}"], + "username": "XXX", + "password": "XXX" + }, + "nijie": + { + "cookies": + { + "NIJIEIJIEID": "XXX", + "nijie_email_hash": "XXX", + "nijie_login_hash": "XXX" + } + }, + "4chan": + { + "directory": ["{category}", "{board}", "{thread} - {title}"] + }, + "danbooru": + { + "pattern": ["d(?:anbooru)?[.:-_](\\w.+)"], + "filename": "{category}_{id:>07}_{md5}.{extension}" + }, + "gelbooru": + { + "pattern": ["g(?:elbooru)?[.:-_](\\w.+)"], + "filename": "{category}_{id:>07}_{md5}.{extension}" + }, + "e621": + { + "pattern": ["e(?:621)?[.:-_](\\w.+)"] + } + } +}
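
A short usage sketch, not part of any patch above: it illustrates how the config module added in the
earlier patches resolves values from a file like this example config.json. The explicit file path,
the fallback format string and the pixiv username below are assumptions made purely for illustration.

    from gallery_dl import config

    # Load the example file explicitly; calling load() without arguments
    # would try the default locations defined in gallery_dl/config.py.
    config.load("config.json")

    # Plain lookup with a fallback, as DownloadManager does for the
    # base directory.
    base = config.get(("base-directory",), default="/tmp/")

    # Per-extractor lookup, as DownloadJob does for filename formats
    # (the fallback string here is made up for this example).
    fmt = config.get(("extractor", "danbooru", "filename"),
                     default="{category}_{id}.{extension}")

    # Session-only override; nothing is written back to the file.
    config.set(("extractor", "pixiv", "username"), "my-account")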