(.+) - (?:vol (\d+) )?"
r"ch (\d+)[^ ]+ Page (\d+) | Batoto!",
@@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor):
r"(.+) - ([^ ]+)",
trans
)
- filename = unquote(filename_from_url(image))
+ filename = text.unquote(text.filename_from_url(image))
name, ext = os.path.splitext(filename)
return url, {
"category": info["category"],
"chapter-id": self.chapter_id,
- "manga": unescape(mmatch.group(1)),
+ "manga": text.unescape(mmatch.group(1)),
"volume": mmatch.group(2) or "",
"chapter": mmatch.group(3),
"page": mmatch.group(4),
"group": tmatch.group(1),
"language": tmatch.group(2),
- "title": unescape(title),
+ "title": text.unescape(title),
"image-url": image,
"name": name,
"extension": ext[1:],
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 88600397..14629fd6 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -8,23 +8,21 @@
"""Base classes for extractors for danbooru and co"""
-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
import xml.etree.ElementTree as ET
import json
import os.path
import urllib.parse
-
class BooruExtractor(SequentialExtractor):
api_url = ""
- def __init__(self, match, config, info):
- SequentialExtractor.__init__(self, config)
+ def __init__(self, match, info):
+ SequentialExtractor.__init__(self)
self.info = info
- self.tags = urllib.parse.unquote(match.group(1))
+ self.tags = text.unquote(match.group(1))
self.page = "page"
self.params = {"tags": self.tags}
self.headers = {}
@@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor):
def get_file_metadata(self, data):
"""Collect metadata for a downloadable file"""
data["category"] = self.info["category"]
- data["name"] = urllib.parse.unquote(
- filename_from_url(self.get_file_url(data))
+ data["name"] = text.unquote(
+ text.filename_from_url(self.get_file_url(data))
)
data["extension"] = os.path.splitext(data["name"])[1][1:]
return data
diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py
new file mode 100644
index 00000000..c6389314
--- /dev/null
+++ b/gallery_dl/extractor/chan.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for different Futaba Channel boards"""
+
+from .common import SequentialExtractor, Message
+from .. import text
+
+class ChanExtractor(SequentialExtractor):
+
+ api_url = ""
+ file_url = ""
+
+ def __init__(self, category, board, thread):
+ SequentialExtractor.__init__(self)
+ self.metadata = {
+ "category": category,
+ "board": board,
+ "thread": thread,
+ }
+
+ def items(self):
+ yield Message.Version, 1
+ posts = self.request(self.api_url.format(**self.metadata)).json()["posts"]
+ self.metadata["title"] = self.get_thread_title(posts[0])
+ yield Message.Directory, self.metadata
+ for post in posts:
+ if "filename" not in post:
+ continue
+ post.update(self.metadata)
+ yield Message.Url, self.file_url.format(**post), post
+ if "extra_files" in post:
+ for file in post["extra_files"]:
+ post.update(file)
+ yield Message.Url, self.file_url.format(**post), post
+
+ @staticmethod
+ def get_thread_title(post):
+ """Return thread title from first post"""
+ if "sub" in post:
+ return post["sub"]
+ return text.remove_html(post["com"])[:50]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index cb8e91ca..4d5b96a9 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -12,7 +12,7 @@ import time
import queue
import requests
import threading
-import html.parser
+from .. import config
class Message():
@@ -44,36 +44,18 @@ class Extractor():
"Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"
)
- @staticmethod
- def extract(txt, begin, end, pos=0):
- try:
- first = txt.index(begin, pos) + len(begin)
- last = txt.index(end, first)
- return txt[first:last], last+len(end)
- except ValueError:
- return None, pos
-
- @staticmethod
- def extract_all(txt, begin, end, pos=0):
- try:
- first = txt.index(begin, pos)
- last = txt.index(end, first + len(begin)) + len(end)
- return txt[first:last], last
- except ValueError:
- return None, pos
-
class SequentialExtractor(Extractor):
- def __init__(self, _):
+ def __init__(self):
Extractor.__init__(self)
class AsynchronousExtractor(Extractor):
- def __init__(self, config):
+ def __init__(self):
Extractor.__init__(self)
- queue_size = int(config.get("general", "queue-size", fallback=5))
+ queue_size = int(config.get(("queue-size",), default=5))
self.__queue = queue.Queue(maxsize=queue_size)
self.__thread = threading.Thread(target=self.async_items, daemon=True)
@@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs):
# everything ok -- proceed to download
return r
-
-def filename_from_url(url):
- pos = url.rfind("/")
- return url[pos+1:]
-
-unescape = html.parser.HTMLParser().unescape
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 3e94cd65..5024020f 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -22,6 +22,6 @@ info = {
class DanbooruExtractor(JSONBooruExtractor):
- def __init__(self, match, config):
- JSONBooruExtractor.__init__(self, match, config, info)
+ def __init__(self, match):
+ JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://danbooru.donmai.us/posts.json"
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 3851e447..af4971e8 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -23,6 +23,6 @@ info = {
class E621Extractor(JSONBooruExtractor):
- def __init__(self, match, config):
- JSONBooruExtractor.__init__(self, match, config, info)
+ def __init__(self, match):
+ JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://e621.net/post/index.json"
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index a95ed82e..87244904 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -22,8 +22,8 @@ info = {
class GelbooruExtractor(XMLBooruExtractor):
- def __init__(self, match, config):
- XMLBooruExtractor.__init__(self, match, config, info)
+ def __init__(self, match):
+ XMLBooruExtractor.__init__(self, match, info)
self.api_url = "http://gelbooru.com/"
self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index f8886a7a..809eaa1a 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -8,9 +8,8 @@
"""Extract images from galleries at http://www.imagebam.com/"""
-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from .. import text
info = {
"category": "imagebam",
@@ -26,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):
url_base = "http://www.imagebam.com"
- def __init__(self, match, config):
- AsynchronousExtractor.__init__(self, config)
+ def __init__(self, match):
+ AsynchronousExtractor.__init__(self)
self.match = match
self.num = 0
self.metadata = {}
@@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor):
done = False
while not done:
# get current page
- text = self.request(self.url_base + next_url).text
+ page = self.request(self.url_base + next_url).text
# get url for next page
- next_url, pos = self.extract(text, "
next image" we are done
- if not text.startswith(">next image", pos):
+ if not page.startswith(">next image", pos):
done = True
# get image url
- img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos)
+ img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)
yield Message.Url, img_url, self.get_file_metadata(img_url)
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
gallery_key = self.match.group(2)
- text = self.request(self.url_base + "/gallery/" + gallery_key).text
- _ , pos = self.extract(text, " ", " <", pos)
- count, pos = self.extract(text, "'>", " images", pos)
- url , pos = self.extract(text, " ", " <", pos)
+ count, pos = text.extract(page, "'>", " images", pos)
+ url , pos = text.extract(page, " ', ' of ')
- data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos)
- data["name"] , pos = self.extract(text, ' title="', '"', pos)
+ data["num"] , pos = text.extract(page, ' ', ' of ')
+ data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos)
+ data["name"] , pos = text.extract(page, ' title="', '"', pos)
return data
- def get_file_url(self, text):
+ def get_file_url(self, page):
"""Extract download-url"""
base = "http://i.imgbox.com/"
- path, _ = self.extract(text, base, '"')
+ path, _ = text.extract(page, base, '"')
return base + path
diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py
index 40932912..8d164764 100644
--- a/gallery_dl/extractor/imgchili.py
+++ b/gallery_dl/extractor/imgchili.py
@@ -8,9 +8,8 @@
"""Extract images from albums at http://imgchili.net/"""
-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
import re
info = {
@@ -25,8 +24,8 @@ info = {
class ImgchiliExtractor(SequentialExtractor):
- def __init__(self, match, config):
- SequentialExtractor.__init__(self, config)
+ def __init__(self, match):
+ SequentialExtractor.__init__(self)
self.match = match
self.num = 0
@@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
- title = self.extract(page, "", "
")[0]
+ title = text.extract(page, "", "
")[0]
return {
"category": info["category"],
"title": title,
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
index 62575308..57fd3efc 100644
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@@ -8,10 +8,8 @@
"""Extract manga pages from http://www.mangareader.net/"""
-from .common import AsynchronousExtractor
-from .common import Message
-from .common import unescape, filename_from_url
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
import os.path
import re
@@ -30,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):
url_base = "http://www.mangareader.net"
- def __init__(self, match, config):
- AsynchronousExtractor.__init__(self, config)
+ def __init__(self, match):
+ AsynchronousExtractor.__init__(self)
self.part = match.group(1)
def items(self):
@@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor):
def get_page_metadata(self, page_url):
"""Collect next url, image-url and metadata for one manga-page"""
page = self.request(page_url).text
- extr = self.extract
+ extr = text.extract
width = None
descr, pos = extr(page, '', '')
- manga , pos = self.extract(page, 'title="', '"', pos)
- chapter , pos = self.extract(page, '">', '', pos)
- json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos)
+ _ , pos = text.extract(page, '
', '')
+ manga , pos = text.extract(page, 'title="', '"', pos)
+ chapter , pos = text.extract(page, '">', '', pos)
+ json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos)
match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
return {
"category": info["category"],
- "manga": unescape(manga),
+ "manga": text.unescape(manga),
"chapter": match.group(2) or match.group(1),
"chapter-minor": match.group(3) or "",
"language": "English",
- "title": unescape(match.group(4) or ""),
+ "title": text.unescape(match.group(4) or ""),
}, json.loads(json_data)
diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py
index 2e574b11..95843176 100644
--- a/gallery_dl/extractor/yandere.py
+++ b/gallery_dl/extractor/yandere.py
@@ -22,6 +22,6 @@ info = {
class YandereExtractor(JSONBooruExtractor):
- def __init__(self, match, config):
- JSONBooruExtractor.__init__(self, match, config, info)
+ def __init__(self, match):
+ JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://yande.re/post.json"
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
new file mode 100644
index 00000000..47fd7258
--- /dev/null
+++ b/gallery_dl/text.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of functions that work on strings/text"""
+
+import re
+import html.parser
+import urllib.parse
+import platform
+
+def remove_html(text):
+ """Remove HTML tags from a string and collapse runs of whitespace"""
+ return " ".join(re.sub("<[^>]+?>", " ", text).split())
+
+def filename_from_url(url):
+ """Extract the last part of a URL's path to use as a filename"""
+ try:
+ path = urllib.parse.urlparse(url).path
+ pos = path.rindex("/")
+ return path[pos+1:]
+ except ValueError:
+ return url
+
+def clean_path_windows(path):
+ """Replace illegal characters in a path-segment with "_" (Windows)"""
+ return re.sub(r'[<>:"\\/|?*]', "_", path)
+
+def clean_path_posix(path):
+ """Replace illegal characters in a path-segment with "_" (Posix)"""
+ return path.replace("/", "_")
+
+def extract(txt, begin, end, pos=0):
+ try:
+ first = txt.index(begin, pos) + len(begin)
+ last = txt.index(end, first)
+ return txt[first:last], last+len(end)
+ except ValueError:
+ return None, pos
+
+def extract_all(txt, begin, end, pos=0):
+ try:
+ first = txt.index(begin, pos)
+ last = txt.index(end, first + len(begin)) + len(end)
+ return txt[first:last], last
+ except ValueError:
+ return None, pos
+
+if platform.system() == "Windows":
+ clean_path = clean_path_windows
+else:
+ clean_path = clean_path_posix
+
+unquote = urllib.parse.unquote
+
+unescape = html.parser.HTMLParser().unescape
diff --git a/setup.py b/setup.py
index db9adead..fe2d134e 100644
--- a/setup.py
+++ b/setup.py
@@ -46,4 +46,5 @@ setup(
"Topic :: Multimedia",
"Topic :: Multimedia :: Graphics",
],
+ test_suite='test',
)
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/test_config.py b/test/test_config.py
new file mode 100644
index 00000000..f8017626
--- /dev/null
+++ b/test/test_config.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.config as config
+import os
+import tempfile
+
+class TestConfig(unittest.TestCase):
+
+ def setUp(self):
+ fd, self._configfile = tempfile.mkstemp()
+ with os.fdopen(fd, "w") as file:
+ file.write('{"a": "1", "b": {"c": "text"}}')
+ config.load(self._configfile)
+
+ def tearDown(self):
+ config.clear()
+ os.remove(self._configfile)
+
+ def test_get(self):
+ self.assertEqual(config.get(["a"]), "1")
+ self.assertEqual(config.get(["b", "c"]), "text")
+ self.assertEqual(config.get(["d"]), None)
+ self.assertEqual(config.get(["e", "f", "g"], 123), 123)
+
+ def test_set(self):
+ config.set(["b", "c"], [1, 2, 3])
+ config.set(["e", "f", "g"], value=234)
+ self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
+ self.assertEqual(config.get(["e", "f", "g"]), 234)
+
+ def test_interpolate(self):
+ self.assertEqual(config.interpolate(["a"]), "1")
+ self.assertEqual(config.interpolate(["b", "a"]), "1")
+ self.assertEqual(config.interpolate(["b", "c"], "2"), "text")
+ self.assertEqual(config.interpolate(["b", "d"], "2"), "2")
+ config.set(["d"], 123)
+ self.assertEqual(config.interpolate(["b", "d"], "2"), 123)
+ self.assertEqual(config.interpolate(["d", "d"], "2"), 123)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_text.py b/test/test_text.py
new file mode 100644
index 00000000..91e0097e
--- /dev/null
+++ b/test/test_text.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.text as text
+
+class TestText(unittest.TestCase):
+
+ def test_remove_html(self):
+ cases = (
+ "Hello World.",
+ " Hello World. ",
+ "Hello
World.",
+ "
HelloWorld.
"
+ )
+ result = "Hello World."
+ for case in cases:
+ self.assertEqual(text.remove_html(case), result)
+
+ def test_filename_from_url(self):
+ cases = (
+ "http://example.org/v2/filename.ext",
+ "http://example.org/v2/filename.ext?param=value#fragment",
+ "example.org/filename.ext",
+ "/filename.ext",
+ "filename.ext",
+ )
+ result = "filename.ext"
+ for case in cases:
+ self.assertEqual(text.filename_from_url(case), result)
+
+ def test_clean_path(self):
+ cases = {
+ "Hello World." : ("Hello World.", "Hello World."),
+ "Hello/World/.": ("Hello_World_.", "Hello_World_."),
+ r':|"World\*?': (
+ '_Hello____World___', r':|"World\*?'
+ ),
+ }
+ for case, result in cases.items():
+ self.assertEqual(text.clean_path_windows(case), result[0])
+ self.assertEqual(text.clean_path_posix (case), result[1])
+
+if __name__ == '__main__':
+ unittest.main()