mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-25 12:12:34 +01:00

Merge branch 'config' into loader

Author: Mike Fährmann
Date:   2015-10-05 17:46:04 +02:00
Commit: e23aaa4298
35 changed files with 532 additions and 351 deletions

.gitignore

@ -1,4 +1,57 @@
# Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/ build/
dist/ develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/

config

@ -1,18 +0,0 @@
[pixiv]
username = XXXXX
password = XXXXX
[exhentai-cookies]
ipb_member_id = XXXXX
ipb_pass_hash = XXXXX
[nijie-cookies]
NIJIEIJIEID = XXXXX
nijie_email_hash = XXXXX
nijie_login_hash = XXXXX
[danbooru]
regex0 = d(?:anbooru)?[.:-_](\w.+)
[gelbooru]
regex0 = g(?:elbooru)?[.:-_](\w.+)

config.json (new file)

@ -0,0 +1,39 @@
{
"base-directory": "/tmp/",
"extractor":
{
"pixiv":
{
"directory": ["{category}", "{artist-id}"],
"username": "XXX",
"password": "XXX"
},
"nijie":
{
"cookies":
{
"NIJIEIJIEID": "XXX",
"nijie_email_hash": "XXX",
"nijie_login_hash": "XXX"
}
},
"4chan":
{
"directory": ["{category}", "{board}", "{thread} - {title}"]
},
"danbooru":
{
"pattern": ["d(?:anbooru)?[.:-_](\\w.+)"],
"filename": "{category}_{id:>07}_{md5}.{extension}"
},
"gelbooru":
{
"pattern": ["g(?:elbooru)?[.:-_](\\w.+)"],
"filename": "{category}_{id:>07}_{md5}.{extension}"
},
"e621":
{
"pattern": ["e(?:621)?[.:-_](\\w.+)"]
}
}
}
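This JSON file replaces the INI-style config above; each extractor's options now live under nested keys beneath "extractor". A rough sketch of how such a nested key is resolved (the get() helper added in gallery_dl/config.py below walks a key tuple the same way; reading config.json directly here is only for illustration):

    import json

    with open("config.json") as file:
        conf = json.load(file)

    # walk a key tuple through the nested dictionaries
    value = conf
    for key in ("extractor", "danbooru", "filename"):
        value = value[key]
    print(value)  # {category}_{id:>07}_{md5}.{extension}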

gallery_dl.egg-info/PKG-INFO (deleted)

@ -1,23 +0,0 @@
Metadata-Version: 1.1
Name: gallery-dl
Version: 0.2
Summary: gallery- and image downloader
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
Author-email: mike_faehrmann@web.de
License: GPLv2
Description: download image galleries from several image hosting platforms
Platform: UNKNOWN
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: Intended Audience :: End Users/Desktop
Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
Classifier: Operating System :: POSIX
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.2
Classifier: Programming Language :: Python :: 3.3
Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Multimedia
Classifier: Topic :: Multimedia :: Graphics

gallery_dl.egg-info/SOURCES.txt (deleted)

@ -1,35 +0,0 @@
setup.py
bin/gallery-dl
gallery_dl/__init__.py
gallery_dl/download.py
gallery_dl.egg-info/PKG-INFO
gallery_dl.egg-info/SOURCES.txt
gallery_dl.egg-info/dependency_links.txt
gallery_dl.egg-info/entry_points.txt
gallery_dl.egg-info/requires.txt
gallery_dl.egg-info/top_level.txt
gallery_dl/downloader/__init__.py
gallery_dl/downloader/common.py
gallery_dl/downloader/http.py
gallery_dl/downloader/https.py
gallery_dl/downloader/text.py
gallery_dl/extractor/3dbooru.py
gallery_dl/extractor/4chan.py
gallery_dl/extractor/8chan.py
gallery_dl/extractor/__init__.py
gallery_dl/extractor/batoto.py
gallery_dl/extractor/booru.py
gallery_dl/extractor/common.py
gallery_dl/extractor/danbooru.py
gallery_dl/extractor/e621.py
gallery_dl/extractor/exhentai.py
gallery_dl/extractor/gelbooru.py
gallery_dl/extractor/imagebam.py
gallery_dl/extractor/imgbox.py
gallery_dl/extractor/imgchili.py
gallery_dl/extractor/mangareader.py
gallery_dl/extractor/nijie.py
gallery_dl/extractor/pixiv.py
gallery_dl/extractor/redhawkscans.py
gallery_dl/extractor/sankaku.py
gallery_dl/extractor/yandere.py

gallery_dl.egg-info/dependency_links.txt (deleted)

@ -1 +0,0 @@

gallery_dl.egg-info/entry_points.txt (deleted)

@ -1,3 +0,0 @@
[console_scripts]
gallery-dl = gallery_dl:main

gallery_dl.egg-info/requires.txt (deleted)

@ -1 +0,0 @@
requests >= 2.0

gallery_dl.egg-info/top_level.txt (deleted)

@ -1 +0,0 @@
gallery_dl

gallery_dl/__init__.py

@ -17,9 +17,7 @@ __email__ = "mike_faehrmann@web.de"
import os import os
import sys import sys
import argparse import argparse
import configparser from . import config, download
from .download import DownloadManager
def parse_cmdline_options(): def parse_cmdline_options():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
@ -41,18 +39,10 @@ def parse_cmdline_options():
) )
return parser.parse_args() return parser.parse_args()
def parse_config_file(path):
config = configparser.ConfigParser(
interpolation=None,
)
config.optionxform = lambda opt: opt
config.read(os.path.expanduser(path))
return config
def main(): def main():
config.load()
opts = parse_cmdline_options() opts = parse_cmdline_options()
conf = parse_config_file(opts.config) dlmgr = download.DownloadManager(opts)
dlmgr = DownloadManager(opts, conf)
try: try:
for url in opts.urls: for url in opts.urls:

gallery_dl/config.py (new file)

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
# Copyright 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Global configuration module"""
import sys
import json
import os.path
import platform
# --------------------------------------------------------------------
# public interface
def load(*files):
"""Load JSON configuration files"""
configfiles = files or _default_configs
for conf in configfiles:
try:
path = os.path.expanduser(conf)
with open(path) as file:
confdict = json.load(file)
_config.update(confdict)
except FileNotFoundError:
continue
except json.decoder.JSONDecodeError as exception:
print("Error while loading '", path, "':", sep="", file=sys.stderr)
print(exception, file=sys.stderr)
def clear():
"""Reset configuration to en empty state"""
globals()["_config"] = {}
def get(keys, default=None):
"""Get the value of property 'key' or a default-value if it doenst exist"""
conf = _config
try:
for k in keys:
conf = conf[k]
return conf
except (KeyError, AttributeError):
return default
def interpolate(keys, default=None):
"""Interpolate the value of 'key'"""
conf = _config
try:
for k in keys:
default = conf.get(keys[-1], default)
conf = conf[k]
return conf
except (KeyError, AttributeError):
return default
def set(keys, value):
"""Set the value of property 'key' for this session"""
conf = _config
for k in keys[:-1]:
try:
conf = conf[k]
except KeyError:
temp = {}
conf[k] = temp
conf = temp
conf[keys[-1]] = value
# --------------------------------------------------------------------
# internals
_config = {}
if platform.system() == "Windows":
_default_configs = [
r"~\.config\gallery-dl.conf",
r"~\.gallery-dl.conf",
]
else:
_default_configs = [
"/etc/gallery-dl.conf",
"~/.config/gallery/config.json",
"~/.config/gallery-dl.conf",
"~/.gallery-dl.conf",
]
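A minimal usage sketch for the new module, assuming the gallery_dl package is importable; load() silently skips config files that do not exist, so calling it with no file present is fine:

    from gallery_dl import config

    config.load()                                             # read the default config files
    config.set(("extractor", "pixiv", "username"), "user")    # session-only override
    fmt = config.get(
        ("extractor", "danbooru", "filename"),
        default="{category}_{id}.{extension}",
    )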

gallery_dl/download.py

@ -11,12 +11,12 @@ import sys
import importlib import importlib
from . import extractor from . import extractor
from .extractor.common import Message from .extractor.common import Message
from . import config
class DownloadManager(): class DownloadManager():
def __init__(self, opts, config): def __init__(self, opts):
self.opts = opts self.opts = opts
self.config = config
self.modules = {} self.modules = {}
def add(self, url): def add(self, url):
@ -36,7 +36,7 @@ class DownloadManager():
if self.opts.dest: if self.opts.dest:
return self.opts.dest return self.opts.dest
else: else:
return self.config.get("general", "destination", fallback="/tmp/") return config.get(("base-directory",), default="/tmp/")
class DownloadJob(): class DownloadJob():
@ -48,16 +48,14 @@ class DownloadJob():
return return
self.directory = mngr.get_base_directory() self.directory = mngr.get_base_directory()
self.downloaders = {} self.downloaders = {}
self.filename_fmt = mngr.config.get( self.filename_fmt = config.get(
self.info["category"], "filename", ("extractor", self.info["category"], "filename"),
fallback=self.info["filename"] default=self.info["filename"]
)
segments = config.get(
("extractor", self.info["category"], "directory"),
default=self.info["directory"]
) )
try:
segments = mngr.config.get(
self.info["category"], "directory"
).split("/")
except Exception:
segments = self.info["directory"]
self.directory_fmt = os.path.join(*segments) self.directory_fmt = os.path.join(*segments)
def run(self): def run(self):
@ -112,13 +110,11 @@ class DownloadJob():
scheme = url[:pos] if pos != -1 else "http" scheme = url[:pos] if pos != -1 else "http"
if scheme == "https": if scheme == "https":
scheme = "http" scheme = "http"
downloader = self.downloaders.get(scheme) downloader = self.downloaders.get(scheme)
if downloader is None: if downloader is None:
module = self.mngr.get_downloader_module(scheme) module = self.mngr.get_downloader_module(scheme)
downloader = module.Downloader() downloader = module.Downloader()
self.downloaders[scheme] = downloader self.downloaders[scheme] = downloader
return downloader return downloader
@staticmethod @staticmethod
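The lookups above fall back to the extractor's own info dict, so a missing "extractor.<category>.filename" or ".directory" entry is not an error. A condensed sketch of the directory resolution (assuming gallery_dl is importable; filling the format string with str.format happens outside the hunk shown here, so that last step is an assumption):

    import os.path
    from gallery_dl import config

    info = {"directory": ["{category}", "{board}-{thread}"]}   # extractor default
    segments = config.get(("extractor", "4chan", "directory"),
                          default=info["directory"])
    directory_fmt = os.path.join(*segments)
    print(directory_fmt.format(category="4chan", board="g", thread="12345"))
    # 4chan/g-12345 (on POSIX)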

gallery_dl/extractor/3dbooru.py

@ -22,8 +22,8 @@ info = {
class ThreeDeeBooruExtractor(JSONBooruExtractor): class ThreeDeeBooruExtractor(JSONBooruExtractor):
def __init__(self, match, config): def __init__(self, match):
JSONBooruExtractor.__init__(self, match, config, info) JSONBooruExtractor.__init__(self, match, info)
self.api_url = "http://behoimi.org/post/index.json" self.api_url = "http://behoimi.org/post/index.json"
self.headers = { self.headers = {
"Referer": "http://behoimi.org/post/show/", "Referer": "http://behoimi.org/post/show/",

gallery_dl/extractor/4chan.py

@ -8,65 +8,25 @@
"""Extract image- and video-urls from threads on https://www.4chan.org/""" """Extract image- and video-urls from threads on https://www.4chan.org/"""
from .common import SequentialExtractor, Message from .chan import ChanExtractor
from urllib.parse import unquote
import re
info = { info = {
"category": "4chan", "category": "4chan",
"extractor": "FourChanExtractor", "extractor": "FourChanExtractor",
"directory": ["{category}", "{board}-{thread-id}"], "directory": ["{category}", "{board}-{thread}"],
"filename": "{timestamp}-{name}", "filename": "{tim}-{filename}{ext}",
"pattern": [ "pattern": [
r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*", r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*",
], ],
} }
class FourChanExtractor(SequentialExtractor): class FourChanExtractor(ChanExtractor):
url_fmt = "https://boards.4chan.org/{0}/res/{1}.html" api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
regex = ( file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
r'<a (?:title="(?P<orig_name>[^"]+)" )?href="'
r'(?P<url>//i.4cdn.org/[^/]+/(?P<timestamp>\d+)\.(?P<extension>[^"]+))'
r'" target="_blank">(?P<name>[^<]+)</a> '
r'\((?P<size>[^,]+), (?P<width>\d+)x(?P<height>\d+)\)'
)
def __init__(self, match, config): def __init__(self, match):
SequentialExtractor.__init__(self, config) ChanExtractor.__init__(
self.match = match self, info["category"],
self.metadata = None match.group(1), match.group(2)
)
def items(self):
yield Message.Version, 1
url = self.url_fmt.format(*self.match.groups())
text = self.request(url).text
self.metadata = self.get_job_metadata(text)
yield Message.Directory, self.metadata
for match in re.finditer(self.regex, text):
yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
def get_job_metadata(self, text):
"""Collect metadata for extractor-job"""
board, thread_id = self.match.groups()
title, _ = self.extract(text, '"description" content="', ' - &quot;/')
return {
"category": info["category"],
"board": board,
"thread-id": thread_id,
"title": unquote(title),
}
def get_file_metadata(self, match):
"""Collect metadata for a downloadable file"""
data = self.metadata
data.update(match.groupdict(default=""))
data["name"] = unquote(data["orig_name"] or data["name"])
return data
@staticmethod
def get_file_url(match):
"""Extract download-url from 'match'"""
return "https:" + match.group("url")

gallery_dl/extractor/8chan.py

@ -8,65 +8,25 @@
"""Extract image- and video-urls from threads on https://8ch.net/""" """Extract image- and video-urls from threads on https://8ch.net/"""
from .common import SequentialExtractor, Message from .chan import ChanExtractor
from urllib.parse import unquote
import re
info = { info = {
"category": "8chan", "category": "8chan",
"extractor": "InfinityChanExtractor", "extractor": "InfinityChanExtractor",
"directory": ["{category}", "{board}-{thread-id}"], "directory": ["{category}", "{board}-{thread}"],
"filename": "{timestamp}-{name}", "filename": "{tim}-{filename}{ext}",
"pattern": [ "pattern": [
r"(?:https?://)?(?:www\.)?(?:8chan\.co|8ch\.net)/([^/]+/res/\d+).*", r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*",
], ],
} }
class InfinityChanExtractor(SequentialExtractor): class InfinityChanExtractor(ChanExtractor):
url_base = "https://8ch.net" api_url = "https://8ch.net/{board}/res/{thread}.json"
url_fmt = url_base + "/{board}/res/{thread-id}.html" file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
regex = (
r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?'
r'<span class="postfilename"( title="([^"]+)")?>([^<]+)<'
)
def __init__(self, match, config):
SequentialExtractor.__init__(self, config)
self.match = match
def items(self):
yield Message.Version, 1
metadata = self.get_job_metadata()
yield Message.Directory, metadata
url = self.url_fmt.format(**metadata)
text = self.request(url).text
for match in re.finditer(self.regex, text):
yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
board, _, thread_id = self.match.group(1).split("/")
return {
"category": info["category"],
"board": board,
"thread-id": thread_id,
}
@staticmethod
def get_file_metadata(match):
"""Collect metadata for a downloadable file"""
return {
"timestamp": match.group(2),
"name": unquote(match.group(4) or match.group(5)),
}
def get_file_url(self, match):
"""Extract download-url from 'match'"""
url = match.group(1)
if url.startswith("/"):
url = self.url_base + url
return url
def __init__(self, match):
ChanExtractor.__init__(
self, info["category"],
match.group(1), match.group(2)
)

gallery_dl/extractor/batoto.py

@ -8,10 +8,8 @@
"""Extract manga pages from http://bato.to/""" """Extract manga pages from http://bato.to/"""
from .common import AsynchronousExtractor from .common import AsynchronousExtractor, Message
from .common import Message from .. import text
from .common import filename_from_url, unescape
from urllib.parse import unquote
import os.path import os.path
import re import re
@ -29,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor):
url_base = "http://bato.to/read/_/" url_base = "http://bato.to/read/_/"
def __init__(self, match, config): def __init__(self, match):
AsynchronousExtractor.__init__(self, config) AsynchronousExtractor.__init__(self)
self.chapter_id = match.group(1) self.chapter_id = match.group(1)
def items(self): def items(self):
@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor):
def get_page_metadata(self, page_url): def get_page_metadata(self, page_url):
"""Collect next url and metadata for one manga-page""" """Collect next url and metadata for one manga-page"""
page = self.request(page_url).text page = self.request(page_url).text
_ , pos = self.extract(page, 'selected="selected"', '') _ , pos = text.extract(page, 'selected="selected"', '')
title, pos = self.extract(page, ': ', '<', pos) title, pos = text.extract(page, ': ', '<', pos)
_ , pos = self.extract(page, 'selected="selected"', '', pos) _ , pos = text.extract(page, 'selected="selected"', '', pos)
trans, pos = self.extract(page, '>', '<', pos) trans, pos = text.extract(page, '>', '<', pos)
_ , pos = self.extract(page, '<div id="full_image"', '', pos) _ , pos = text.extract(page, '<div id="full_image"', '', pos)
image, pos = self.extract(page, '<img src="', '"', pos) image, pos = text.extract(page, '<img src="', '"', pos)
url , pos = self.extract(page, '<a href="', '"', pos) url , pos = text.extract(page, '<a href="', '"', pos)
mmatch = re.search( mmatch = re.search(
r"<title>(.+) - (?:vol (\d+) )?" r"<title>(.+) - (?:vol (\d+) )?"
r"ch (\d+)[^ ]+ Page (\d+) | Batoto!</title>", r"ch (\d+)[^ ]+ Page (\d+) | Batoto!</title>",
@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor):
r"(.+) - ([^ ]+)", r"(.+) - ([^ ]+)",
trans trans
) )
filename = unquote(filename_from_url(image)) filename = text.unquote(text.filename_from_url(image))
name, ext = os.path.splitext(filename) name, ext = os.path.splitext(filename)
return url, { return url, {
"category": info["category"], "category": info["category"],
"chapter-id": self.chapter_id, "chapter-id": self.chapter_id,
"manga": unescape(mmatch.group(1)), "manga": text.unescape(mmatch.group(1)),
"volume": mmatch.group(2) or "", "volume": mmatch.group(2) or "",
"chapter": mmatch.group(3), "chapter": mmatch.group(3),
"page": mmatch.group(4), "page": mmatch.group(4),
"group": tmatch.group(1), "group": tmatch.group(1),
"language": tmatch.group(2), "language": tmatch.group(2),
"title": unescape(title), "title": text.unescape(title),
"image-url": image, "image-url": image,
"name": name, "name": name,
"extension": ext[1:], "extension": ext[1:],

gallery_dl/extractor/booru.py

@ -8,23 +8,21 @@
"""Base classes for extractors for danbooru and co""" """Base classes for extractors for danbooru and co"""
from .common import SequentialExtractor from .common import SequentialExtractor, Message
from .common import Message from .. import text
from .common import filename_from_url
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import json import json
import os.path import os.path
import urllib.parse import urllib.parse
class BooruExtractor(SequentialExtractor): class BooruExtractor(SequentialExtractor):
api_url = "" api_url = ""
def __init__(self, match, config, info): def __init__(self, match, info):
SequentialExtractor.__init__(self, config) SequentialExtractor.__init__(self)
self.info = info self.info = info
self.tags = urllib.parse.unquote(match.group(1)) self.tags = text.unquote(match.group(1))
self.page = "page" self.page = "page"
self.params = {"tags": self.tags} self.params = {"tags": self.tags}
self.headers = {} self.headers = {}
@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor):
def get_file_metadata(self, data): def get_file_metadata(self, data):
"""Collect metadata for a downloadable file""" """Collect metadata for a downloadable file"""
data["category"] = self.info["category"] data["category"] = self.info["category"]
data["name"] = urllib.parse.unquote( data["name"] = text.unquote(
filename_from_url(self.get_file_url(data)) text.filename_from_url(self.get_file_url(data))
) )
data["extension"] = os.path.splitext(data["name"])[1][1:] data["extension"] = os.path.splitext(data["name"])[1][1:]
return data return data

gallery_dl/extractor/chan.py (new file)

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# Copyright 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Base classes for extractors for different Futaba Channel boards"""
from .common import SequentialExtractor, Message
from .. import text
class ChanExtractor(SequentialExtractor):
api_url = ""
file_url = ""
def __init__(self, category, board, thread):
SequentialExtractor.__init__(self)
self.metadata = {
"category": category,
"board": board,
"thread": thread,
}
def items(self):
yield Message.Version, 1
posts = self.request(self.api_url.format(**self.metadata)).json()["posts"]
self.metadata["title"] = self.get_thread_title(posts[0])
yield Message.Directory, self.metadata
for post in posts:
if "filename" not in post:
continue
post.update(self.metadata)
yield Message.Url, self.file_url.format(**post), post
if "extra_files" in post:
for file in post["extra_files"]:
post.update(file)
yield Message.Url, self.file_url.format(**post), post
@staticmethod
def get_thread_title(post):
"""Return thread title from first post"""
if "sub" in post:
return post["sub"]
return text.remove_html(post["com"])[:50]
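A job consuming this extractor sees a flat stream of messages: a Version message, one Directory message with the thread metadata, then one Url message per attachment (and per extra file). A hedged consumer sketch; the request to the JSON API happens inside items(), so running this for real needs a constructed extractor and network access:

    from gallery_dl.extractor.common import Message

    def consume(extractor):
        """Walk the message stream produced by ChanExtractor.items()."""
        for msg in extractor.items():
            if msg[0] == Message.Version:
                pass                                  # protocol version, currently 1
            elif msg[0] == Message.Directory:
                print("thread metadata:", msg[1])     # category, board, thread, title
            elif msg[0] == Message.Url:
                url, data = msg[1], msg[2]
                print("download", url, "->", data["filename"])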

gallery_dl/extractor/common.py

@ -12,7 +12,7 @@ import time
import queue import queue
import requests import requests
import threading import threading
import html.parser from .. import config
class Message(): class Message():
@ -44,36 +44,18 @@ class Extractor():
"Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0" "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"
) )
@staticmethod
def extract(txt, begin, end, pos=0):
try:
first = txt.index(begin, pos) + len(begin)
last = txt.index(end, first)
return txt[first:last], last+len(end)
except ValueError:
return None, pos
@staticmethod
def extract_all(txt, begin, end, pos=0):
try:
first = txt.index(begin, pos)
last = txt.index(end, first + len(begin)) + len(end)
return txt[first:last], last
except ValueError:
return None, pos
class SequentialExtractor(Extractor): class SequentialExtractor(Extractor):
def __init__(self, _): def __init__(self):
Extractor.__init__(self) Extractor.__init__(self)
class AsynchronousExtractor(Extractor): class AsynchronousExtractor(Extractor):
def __init__(self, config): def __init__(self):
Extractor.__init__(self) Extractor.__init__(self)
queue_size = int(config.get("general", "queue-size", fallback=5)) queue_size = int(config.get(("queue-size",), default=5))
self.__queue = queue.Queue(maxsize=queue_size) self.__queue = queue.Queue(maxsize=queue_size)
self.__thread = threading.Thread(target=self.async_items, daemon=True) self.__thread = threading.Thread(target=self.async_items, daemon=True)
@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs):
# everything ok -- proceed to download # everything ok -- proceed to download
return r return r
def filename_from_url(url):
pos = url.rfind("/")
return url[pos+1:]
unescape = html.parser.HTMLParser().unescape

gallery_dl/extractor/danbooru.py

@ -22,6 +22,6 @@ info = {
class DanbooruExtractor(JSONBooruExtractor): class DanbooruExtractor(JSONBooruExtractor):
def __init__(self, match, config): def __init__(self, match):
JSONBooruExtractor.__init__(self, match, config, info) JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://danbooru.donmai.us/posts.json" self.api_url = "https://danbooru.donmai.us/posts.json"

gallery_dl/extractor/e621.py

@ -23,6 +23,6 @@ info = {
class E621Extractor(JSONBooruExtractor): class E621Extractor(JSONBooruExtractor):
def __init__(self, match, config): def __init__(self, match):
JSONBooruExtractor.__init__(self, match, config, info) JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://e621.net/post/index.json" self.api_url = "https://e621.net/post/index.json"

gallery_dl/extractor/gelbooru.py

@ -22,8 +22,8 @@ info = {
class GelbooruExtractor(XMLBooruExtractor): class GelbooruExtractor(XMLBooruExtractor):
def __init__(self, match, config): def __init__(self, match):
XMLBooruExtractor.__init__(self, match, config, info) XMLBooruExtractor.__init__(self, match, info)
self.api_url = "http://gelbooru.com/" self.api_url = "http://gelbooru.com/"
self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags} self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}

gallery_dl/extractor/imagebam.py

@ -8,9 +8,8 @@
"""Extract images from galleries at http://www.imagebam.com/""" """Extract images from galleries at http://www.imagebam.com/"""
from .common import AsynchronousExtractor from .common import AsynchronousExtractor, Message
from .common import Message from .. import text
from .common import filename_from_url
info = { info = {
"category": "imagebam", "category": "imagebam",
@ -26,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):
url_base = "http://www.imagebam.com" url_base = "http://www.imagebam.com"
def __init__(self, match, config): def __init__(self, match):
AsynchronousExtractor.__init__(self, config) AsynchronousExtractor.__init__(self)
self.match = match self.match = match
self.num = 0 self.num = 0
self.metadata = {} self.metadata = {}
@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor):
done = False done = False
while not done: while not done:
# get current page # get current page
text = self.request(self.url_base + next_url).text page = self.request(self.url_base + next_url).text
# get url for next page # get url for next page
next_url, pos = self.extract(text, "<a class='buttonblue' href='", "'") next_url, pos = text.extract(page, "<a class='buttonblue' href='", "'")
# if the following text isn't "><span>next image" we are done # if the following text isn't "><span>next image" we are done
if not text.startswith("><span>next image", pos): if not page.startswith("><span>next image", pos):
done = True done = True
# get image url # get image url
img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos) img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)
yield Message.Url, img_url, self.get_file_metadata(img_url) yield Message.Url, img_url, self.get_file_metadata(img_url)
def get_job_metadata(self): def get_job_metadata(self):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
gallery_key = self.match.group(2) gallery_key = self.match.group(2)
text = self.request(self.url_base + "/gallery/" + gallery_key).text page = self.request(self.url_base + "/gallery/" + gallery_key).text
_ , pos = self.extract(text, "<img src='/img/icons/photos.png'", "") _ , pos = text.extract(page, "<img src='/img/icons/photos.png'", "")
title, pos = self.extract(text, "'> ", " <", pos) title, pos = text.extract(page, "'> ", " <", pos)
count, pos = self.extract(text, "'>", " images", pos) count, pos = text.extract(page, "'>", " images", pos)
url , pos = self.extract(text, "<a href='http://www.imagebam.com", "'", pos) url , pos = text.extract(page, "<a href='http://www.imagebam.com", "'", pos)
return { return {
"category": info["category"], "category": info["category"],
"key": gallery_key, "key": gallery_key,
@ -77,5 +76,5 @@ class ImagebamExtractor(AsynchronousExtractor):
self.num += 1 self.num += 1
data = self.metadata.copy() data = self.metadata.copy()
data["num"] = self.num data["num"] = self.num
data["name"] = filename_from_url(url) data["name"] = text.filename_from_url(url)
return data return data

gallery_dl/extractor/imgbox.py

@ -9,6 +9,7 @@
"""Extract images from galleries at http://imgbox.com/""" """Extract images from galleries at http://imgbox.com/"""
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text
import re import re
info = { info = {
@ -25,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor):
url_base = "http://imgbox.com" url_base = "http://imgbox.com"
def __init__(self, match, config): def __init__(self, match):
AsynchronousExtractor.__init__(self, config) AsynchronousExtractor.__init__(self)
self.key = match.group(1) self.key = match.group(1)
self.metadata = {} self.metadata = {}
@ -36,8 +37,8 @@ class ImgboxExtractor(AsynchronousExtractor):
yield Message.Version, 1 yield Message.Version, 1
yield Message.Directory, self.metadata yield Message.Directory, self.metadata
for match in re.finditer(r'<a href="([^"]+)"><img alt="', page): for match in re.finditer(r'<a href="([^"]+)"><img alt="', page):
text = self.request(self.url_base + match.group(1)).text imgpage = self.request(self.url_base + match.group(1)).text
yield Message.Url, self.get_file_url(text), self.get_file_metadata(text) yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
@ -51,16 +52,16 @@ class ImgboxExtractor(AsynchronousExtractor):
"count": match.group(4), "count": match.group(4),
} }
def get_file_metadata(self, text): def get_file_metadata(self, page):
"""Collect metadata for a downloadable file""" """Collect metadata for a downloadable file"""
data = self.metadata.copy() data = self.metadata.copy()
data["num"] , pos = self.extract(text, '</a> &nbsp; ', ' of ') data["num"] , pos = text.extract(page, '</a> &nbsp; ', ' of ')
data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos) data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos)
data["name"] , pos = self.extract(text, ' title="', '"', pos) data["name"] , pos = text.extract(page, ' title="', '"', pos)
return data return data
def get_file_url(self, text): def get_file_url(self, page):
"""Extract download-url""" """Extract download-url"""
base = "http://i.imgbox.com/" base = "http://i.imgbox.com/"
path, _ = self.extract(text, base, '"') path, _ = text.extract(page, base, '"')
return base + path return base + path

gallery_dl/extractor/imgchili.py

@ -8,9 +8,8 @@
"""Extract images from albums at http://imgchili.net/""" """Extract images from albums at http://imgchili.net/"""
from .common import SequentialExtractor from .common import SequentialExtractor, Message
from .common import Message from .. import text
from .common import filename_from_url
import re import re
info = { info = {
@ -25,8 +24,8 @@ info = {
class ImgchiliExtractor(SequentialExtractor): class ImgchiliExtractor(SequentialExtractor):
def __init__(self, match, config): def __init__(self, match):
SequentialExtractor.__init__(self, config) SequentialExtractor.__init__(self)
self.match = match self.match = match
self.num = 0 self.num = 0
@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor):
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
title = self.extract(page, "<h1>", "</h1>")[0] title = text.extract(page, "<h1>", "</h1>")[0]
return { return {
"category": info["category"], "category": info["category"],
"title": title, "title": title,

gallery_dl/extractor/mangareader.py

@ -8,10 +8,8 @@
"""Extract manga pages from http://www.mangareader.net/""" """Extract manga pages from http://www.mangareader.net/"""
from .common import AsynchronousExtractor from .common import AsynchronousExtractor, Message
from .common import Message from .. import text
from .common import unescape, filename_from_url
from urllib.parse import unquote
import os.path import os.path
import re import re
@ -30,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):
url_base = "http://www.mangareader.net" url_base = "http://www.mangareader.net"
def __init__(self, match, config): def __init__(self, match):
AsynchronousExtractor.__init__(self, config) AsynchronousExtractor.__init__(self)
self.part = match.group(1) self.part = match.group(1)
def items(self): def items(self):
@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor):
def get_page_metadata(self, page_url): def get_page_metadata(self, page_url):
"""Collect next url, image-url and metadata for one manga-page""" """Collect next url, image-url and metadata for one manga-page"""
page = self.request(page_url).text page = self.request(page_url).text
extr = self.extract extr = text.extract
width = None width = None
descr, pos = extr(page, '<meta name="description" content="', '"') descr, pos = extr(page, '<meta name="description" content="', '"')
test , pos = extr(page, "document['pu']", '', pos) test , pos = extr(page, "document['pu']", '', pos)
@ -62,13 +60,13 @@ class MangaReaderExtractor(AsynchronousExtractor):
width , pos = extr(page, '<img id="img" width="', '"', pos) width , pos = extr(page, '<img id="img" width="', '"', pos)
height, pos = extr(page, ' height="', '"', pos) height, pos = extr(page, ' height="', '"', pos)
image, pos = extr(page, ' src="', '"', pos) image, pos = extr(page, ' src="', '"', pos)
filename = unquote(filename_from_url(image)) filename = text.unquote(text.filename_from_url(image))
name, ext = os.path.splitext(filename) name, ext = os.path.splitext(filename)
match = re.match(r"(.*) (\d+) - Read \1 \2 Manga Scans Page (\d+)", descr) match = re.match(r"(.*) (\d+) - Read \1 \2 Manga Scans Page (\d+)", descr)
return self.url_base + url, image, { return self.url_base + url, image, {
"category": info["category"], "category": info["category"],
"manga": unescape(match.group(1)), "manga": text.unescape(match.group(1)),
"chapter": match.group(2), "chapter": match.group(2),
"page": match.group(3), "page": match.group(3),
"width": width, "width": width,

gallery_dl/extractor/nijie.py

@ -8,9 +8,8 @@
"""Extract images from https://nijie.info/""" """Extract images from https://nijie.info/"""
from .common import AsynchronousExtractor from .common import AsynchronousExtractor, Message
from .common import Message from .. import config, text
from .common import filename_from_url
import re import re
info = { info = {
@ -27,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor):
popup_url = "https://nijie.info/view_popup.php?id=" popup_url = "https://nijie.info/view_popup.php?id="
def __init__(self, match, config): def __init__(self, match):
AsynchronousExtractor.__init__(self, config) AsynchronousExtractor.__init__(self)
self.artist_id = match.group(1) self.artist_id = match.group(1)
self.artist_url = ( self.artist_url = (
"https://nijie.info/members_illust.php?id=" "https://nijie.info/members_illust.php?id="
@ -37,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor):
self.session.headers["Referer"] = self.artist_url self.session.headers["Referer"] = self.artist_url
self.session.cookies["R18"] = "1" self.session.cookies["R18"] = "1"
self.session.cookies["nijie_referer"] = "nijie.info" self.session.cookies["nijie_referer"] = "nijie.info"
self.session.cookies.update(config["nijie-cookies"]) self.session.cookies.update(
config.get(("extractor", info["category"], "cookies"))
)
def items(self): def items(self):
data = self.get_job_metadata() data = self.get_job_metadata()
@ -56,19 +57,20 @@ class NijieExtractor(AsynchronousExtractor):
} }
def get_image_ids(self): def get_image_ids(self):
text = self.request(self.artist_url).text """Collect all image-ids for a specific artist"""
page = self.request(self.artist_url).text
regex = r'<a href="/view\.php\?id=(\d+)"' regex = r'<a href="/view\.php\?id=(\d+)"'
return [m.group(1) for m in re.finditer(regex, text)] return [m.group(1) for m in re.finditer(regex, page)]
def get_image_data(self, image_id): def get_image_data(self, image_id):
"""Get URL and metadata for images specified by 'image_id'""" """Get URL and metadata for images specified by 'image_id'"""
text = self.request(self.popup_url + image_id).text page = self.request(self.popup_url + image_id).text
matches = re.findall('<img src="([^"]+)"', text) matches = re.findall('<img src="([^"]+)"', page)
for index, url in enumerate(matches): for index, url in enumerate(matches):
yield "https:" + url, { yield "https:" + url, {
"count": len(matches), "count": len(matches),
"index": index, "index": index,
"image-id": image_id, "image-id": image_id,
"name" : filename_from_url(url), "name" : text.filename_from_url(url),
"extension": url[url.rfind(".")+1:], "extension": url[url.rfind(".")+1:],
} }

gallery_dl/extractor/pixiv.py

@ -8,8 +8,8 @@
"""Extract images and ugoira from http://www.pixiv.net/""" """Extract images and ugoira from http://www.pixiv.net/"""
from .common import SequentialExtractor from .common import SequentialExtractor, Message
from .common import Message from .. import config, text
import re import re
import json import json
@ -29,16 +29,15 @@ class PixivExtractor(SequentialExtractor):
member_url = "http://www.pixiv.net/member_illust.php" member_url = "http://www.pixiv.net/member_illust.php"
illust_url = "http://www.pixiv.net/member_illust.php?mode=medium" illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"
def __init__(self, match, config): def __init__(self, match):
SequentialExtractor.__init__(self, config) SequentialExtractor.__init__(self)
self.config = config
self.artist_id = match.group(1) self.artist_id = match.group(1)
self.api = PixivAPI(self.session) self.api = PixivAPI(self.session)
def items(self): def items(self):
self.api.login( self.api.login(
self.config.get("pixiv", "username"), config.get(("extractor", "pixiv", "username")),
self.config.get("pixiv", "password"), config.get(("extractor", "pixiv", "password")),
) )
metadata = self.get_job_metadata() metadata = self.get_job_metadata()
@ -84,9 +83,9 @@ class PixivExtractor(SequentialExtractor):
def get_works(self): def get_works(self):
"""Yield all work-items for a pixiv-member""" """Yield all work-items for a pixiv-member"""
page = 1 pagenum = 1
while True: while True:
data = self.api.user_works(self.artist_id, page) data = self.api.user_works(self.artist_id, pagenum)
for work in data["response"]: for work in data["response"]:
url = work["image_urls"]["large"] url = work["image_urls"]["large"]
work["num"] = "" work["num"] = ""
@ -96,17 +95,17 @@ class PixivExtractor(SequentialExtractor):
pinfo = data["pagination"] pinfo = data["pagination"]
if pinfo["current"] == pinfo["pages"]: if pinfo["current"] == pinfo["pages"]:
return return
page = pinfo["next"] pagenum = pinfo["next"]
def parse_ugoira(self, data): def parse_ugoira(self, data):
"""Parse ugoira data""" """Parse ugoira data"""
# get illust page # get illust page
text = self.request( page = self.request(
self.illust_url, params={"illust_id": data["id"]}, self.illust_url, params={"illust_id": data["id"]},
).text ).text
# parse page # parse page
frames, _ = self.extract(text, ',"frames":[', ']') frames, _ = text.extract(page, ',"frames":[', ']')
# build url # build url
url = re.sub( url = re.sub(
@ -146,7 +145,7 @@ class PixivAPI():
self.session = session self.session = session
self.session.headers.update({ self.session.headers.update({
"Referer": "http://www.pixiv.net/", "Referer": "http://www.pixiv.net/",
"User-Agent": "PixivIOSApp/5.1.1", "User-Agent": "PixivIOSApp/5.8.0",
# "Authorization": "Bearer 8mMXXWT9iuwdJvsVIvQsFYDwuZpRCMePeyagSh30ZdU", # "Authorization": "Bearer 8mMXXWT9iuwdJvsVIvQsFYDwuZpRCMePeyagSh30ZdU",
}) })

gallery_dl/extractor/redhawkscans.py

@ -8,9 +8,8 @@
"""Extract manga pages from http://manga.redhawkscans.com/""" """Extract manga pages from http://manga.redhawkscans.com/"""
from .common import SequentialExtractor from .common import SequentialExtractor, Message
from .common import Message from .. import text
from .common import unescape
import os.path import os.path
import json import json
import re import re
@ -29,8 +28,8 @@ class RedHawkScansExtractor(SequentialExtractor):
url_base = "https://manga.redhawkscans.com/reader/read/" url_base = "https://manga.redhawkscans.com/reader/read/"
def __init__(self, match, config): def __init__(self, match):
SequentialExtractor.__init__(self, config) SequentialExtractor.__init__(self)
self.part = match.group(1) self.part = match.group(1)
def items(self): def items(self):
@ -50,16 +49,16 @@ class RedHawkScansExtractor(SequentialExtractor):
response = self.request(self.url_base + self.part) response = self.request(self.url_base + self.part)
response.encoding = "utf-8" response.encoding = "utf-8"
page = response.text page = response.text
_ , pos = self.extract(page, '<h1 class="tbtitle dnone">', '') _ , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
manga , pos = self.extract(page, 'title="', '"', pos) manga , pos = text.extract(page, 'title="', '"', pos)
chapter , pos = self.extract(page, '">', '</a>', pos) chapter , pos = text.extract(page, '">', '</a>', pos)
json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos) json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos)
match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter) match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
return { return {
"category": info["category"], "category": info["category"],
"manga": unescape(manga), "manga": text.unescape(manga),
"chapter": match.group(2) or match.group(1), "chapter": match.group(2) or match.group(1),
"chapter-minor": match.group(3) or "", "chapter-minor": match.group(3) or "",
"language": "English", "language": "English",
"title": unescape(match.group(4) or ""), "title": text.unescape(match.group(4) or ""),
}, json.loads(json_data) }, json.loads(json_data)

gallery_dl/extractor/yandere.py

@ -22,6 +22,6 @@ info = {
class YandereExtractor(JSONBooruExtractor): class YandereExtractor(JSONBooruExtractor):
def __init__(self, match, config): def __init__(self, match):
JSONBooruExtractor.__init__(self, match, config, info) JSONBooruExtractor.__init__(self, match, info)
self.api_url = "https://yande.re/post.json" self.api_url = "https://yande.re/post.json"

gallery_dl/text.py (new file)

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
# Copyright 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Collection of functions that work in strings/text"""
import re
import html.parser
import urllib.parse
import platform
def remove_html(text):
"""Remove html-tags from a string"""
return " ".join(re.sub("<[^>]+?>", " ", text).split())
def filename_from_url(url):
"""Extract the last part of an url to use as a filename"""
try:
path = urllib.parse.urlparse(url).path
pos = path.rindex("/")
return path[pos+1:]
except ValueError:
return url
def clean_path_windows(path):
"""Remove illegal characters from a path-segment (Windows)"""
return re.sub(r'[<>:"\\/|?*]', "_", path)
def clean_path_posix(path):
"""Remove illegal characters from a path-segment (Posix)"""
return path.replace("/", "_")
def extract(txt, begin, end, pos=0):
try:
first = txt.index(begin, pos) + len(begin)
last = txt.index(end, first)
return txt[first:last], last+len(end)
except ValueError:
return None, pos
def extract_all(txt, begin, end, pos=0):
try:
first = txt.index(begin, pos)
last = txt.index(end, first + len(begin)) + len(end)
return txt[first:last], last
except ValueError:
return None, pos
if platform.system() == "Windows":
clean_path = clean_path_windows
else:
clean_path = clean_path_posix
unquote = urllib.parse.unquote
unescape = html.parser.HTMLParser().unescape
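A few example calls for the new helpers, matching the cases exercised by test/test_text.py below (the literal strings are just samples):

    from gallery_dl import text

    text.remove_html("<div>Hello<br/>World.</div>")
    # 'Hello World.'
    text.filename_from_url("http://example.org/v2/filename.ext?param=value")
    # 'filename.ext'
    text.extract('<a href="target">', 'href="', '"')
    # ('target', 16)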

setup.py

@ -46,4 +46,5 @@ setup(
"Topic :: Multimedia", "Topic :: Multimedia",
"Topic :: Multimedia :: Graphics", "Topic :: Multimedia :: Graphics",
], ],
test_suite='test',
) )
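With test_suite set to 'test', the new tests should be runnable with "python setup.py test" from the repository root, or directly with "python -m unittest test.test_config test.test_text" (the exact invocation is an assumption about the usual setuptools/unittest workflow, not something stated in this commit).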

test/__init__.py (new, empty file)

test/test_config.py (new file)

@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import unittest
import gallery_dl.config as config
import os
import tempfile
class TestConfig(unittest.TestCase):
def setUp(self):
fd, self._configfile = tempfile.mkstemp()
with os.fdopen(fd, "w") as file:
file.write('{"a": "1", "b": {"c": "text"}}')
config.load(self._configfile)
def tearDown(self):
config.clear()
os.remove(self._configfile)
def test_get(self):
self.assertEqual(config.get(["a"]), "1")
self.assertEqual(config.get(["b", "c"]), "text")
self.assertEqual(config.get(["d"]), None)
self.assertEqual(config.get(["e", "f", "g"], 123), 123)
def test_set(self):
config.set(["b", "c"], [1, 2, 3])
config.set(["e", "f", "g"], value=234)
self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
self.assertEqual(config.get(["e", "f", "g"]), 234)
def test_interpolate(self):
self.assertEqual(config.interpolate(["a"]), "1")
self.assertEqual(config.interpolate(["b", "a"]), "1")
self.assertEqual(config.interpolate(["b", "c"], "2"), "text")
self.assertEqual(config.interpolate(["b", "d"], "2"), "2")
config.set(["d"], 123)
self.assertEqual(config.interpolate(["b", "d"], "2"), 123)
self.assertEqual(config.interpolate(["d", "d"], "2"), 123)
if __name__ == '__main__':
unittest.main()

test/test_text.py (new file)

@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import unittest
import gallery_dl.text as text
class TestText(unittest.TestCase):
def test_remove_html(self):
cases = (
"Hello World.",
" Hello World. ",
"Hello<br/>World.",
"<div><span class='a'>Hello</span><strong>World.</strong></div>"
)
result = "Hello World."
for case in cases:
self.assertEqual(text.remove_html(case), result)
def test_filename_from_url(self):
cases = (
"http://example.org/v2/filename.ext",
"http://example.org/v2/filename.ext?param=value#fragment",
"example.org/filename.ext",
"/filename.ext",
"filename.ext",
)
result = "filename.ext"
for case in cases:
self.assertEqual(text.filename_from_url(case), result)
def test_clean_path(self):
cases = {
"Hello World." : ("Hello World.", "Hello World."),
"Hello/World/.": ("Hello_World_.", "Hello_World_."),
r'<Hello>:|"World\*?': (
'_Hello____World___', r'<Hello>:|"World\*?'
),
}
for case, result in cases.items():
self.assertEqual(text.clean_path_windows(case), result[0])
self.assertEqual(text.clean_path_posix (case), result[1])
if __name__ == '__main__':
unittest.main()