Mirror of https://github.com/mikf/gallery-dl.git, synced 2024-11-25 12:12:34 +01:00

Commit e23aaa4298: Merge branch 'config' into loader

.gitignore (vendored, 53 changes)
@@ -1,4 +1,57 @@
+# Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
 build/
+develop-eggs/
 dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
config (deleted, 18 lines)
@@ -1,18 +0,0 @@
-[pixiv]
-username = XXXXX
-password = XXXXX
-
-[exhentai-cookies]
-ipb_member_id = XXXXX
-ipb_pass_hash = XXXXX
-
-[nijie-cookies]
-NIJIEIJIEID = XXXXX
-nijie_email_hash = XXXXX
-nijie_login_hash = XXXXX
-
-[danbooru]
-regex0 = d(?:anbooru)?[.:-_](\w.+)
-
-[gelbooru]
-regex0 = g(?:elbooru)?[.:-_](\w.+)
config.json (new file, 39 lines)
@@ -0,0 +1,39 @@
+{
+    "base-directory": "/tmp/",
+    "extractor":
+    {
+        "pixiv":
+        {
+            "directory": ["{category}", "{artist-id}"],
+            "username": "XXX",
+            "password": "XXX"
+        },
+        "nijie":
+        {
+            "cookies":
+            {
+                "NIJIEIJIEID": "XXX",
+                "nijie_email_hash": "XXX",
+                "nijie_login_hash": "XXX"
+            }
+        },
+        "4chan":
+        {
+            "directory": ["{category}", "{board}", "{thread} - {title}"]
+        },
+        "danbooru":
+        {
+            "pattern": ["d(?:anbooru)?[.:-_](\\w.+)"],
+            "filename": "{category}_{id:>07}_{md5}.{extension}"
+        },
+        "gelbooru":
+        {
+            "pattern": ["g(?:elbooru)?[.:-_](\\w.+)"],
+            "filename": "{category}_{id:>07}_{md5}.{extension}"
+        },
+        "e621":
+        {
+            "pattern": ["e(?:621)?[.:-_](\\w.+)"]
+        }
+    }
+}
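The nesting of config.json mirrors the key sequences that the new config module (gallery_dl/config.py, added below) walks when looking a value up. A minimal sketch of that correspondence, using only the placeholder values from the sample file:

    from gallery_dl import config

    config.load("config.json")   # or config.load() for the default file locations

    # each element of the key sequence descends one level into the JSON object
    config.get(("base-directory",), default="/tmp/")   # -> "/tmp/"
    config.get(("extractor", "pixiv", "username"))     # -> "XXX"
    config.get(("extractor", "nijie", "cookies"))      # -> the cookie dict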
gallery_dl.egg-info/PKG-INFO (deleted, 23 lines)
@@ -1,23 +0,0 @@
-Metadata-Version: 1.1
-Name: gallery-dl
-Version: 0.2
-Summary: gallery- and image downloader
-Home-page: https://github.com/mikf/gallery-dl
-Author: Mike Fährmann
-Author-email: mike_faehrmann@web.de
-License: GPLv2
-Description: download image galleries from several image hosting platforms
-Platform: UNKNOWN
-Classifier: Development Status :: 3 - Alpha
-Classifier: Environment :: Console
-Classifier: Intended Audience :: End Users/Desktop
-Classifier: License :: OSI Approved :: GNU General Public License v2 (GPLv2)
-Classifier: Operating System :: POSIX
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.2
-Classifier: Programming Language :: Python :: 3.3
-Classifier: Programming Language :: Python :: 3.4
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
-Classifier: Topic :: Multimedia
-Classifier: Topic :: Multimedia :: Graphics
gallery_dl.egg-info/SOURCES.txt (deleted, 35 lines)
@@ -1,35 +0,0 @@
-setup.py
-bin/gallery-dl
-gallery_dl/__init__.py
-gallery_dl/download.py
-gallery_dl.egg-info/PKG-INFO
-gallery_dl.egg-info/SOURCES.txt
-gallery_dl.egg-info/dependency_links.txt
-gallery_dl.egg-info/entry_points.txt
-gallery_dl.egg-info/requires.txt
-gallery_dl.egg-info/top_level.txt
-gallery_dl/downloader/__init__.py
-gallery_dl/downloader/common.py
-gallery_dl/downloader/http.py
-gallery_dl/downloader/https.py
-gallery_dl/downloader/text.py
-gallery_dl/extractor/3dbooru.py
-gallery_dl/extractor/4chan.py
-gallery_dl/extractor/8chan.py
-gallery_dl/extractor/__init__.py
-gallery_dl/extractor/batoto.py
-gallery_dl/extractor/booru.py
-gallery_dl/extractor/common.py
-gallery_dl/extractor/danbooru.py
-gallery_dl/extractor/e621.py
-gallery_dl/extractor/exhentai.py
-gallery_dl/extractor/gelbooru.py
-gallery_dl/extractor/imagebam.py
-gallery_dl/extractor/imgbox.py
-gallery_dl/extractor/imgchili.py
-gallery_dl/extractor/mangareader.py
-gallery_dl/extractor/nijie.py
-gallery_dl/extractor/pixiv.py
-gallery_dl/extractor/redhawkscans.py
-gallery_dl/extractor/sankaku.py
-gallery_dl/extractor/yandere.py
gallery_dl.egg-info/dependency_links.txt (deleted)
@@ -1 +0,0 @@
-
gallery_dl.egg-info/entry_points.txt (deleted)
@@ -1,3 +0,0 @@
-[console_scripts]
-gallery-dl = gallery_dl:main
-
gallery_dl.egg-info/requires.txt (deleted)
@@ -1 +0,0 @@
-requests >= 2.0
gallery_dl.egg-info/top_level.txt (deleted)
@@ -1 +0,0 @@
-gallery_dl
gallery_dl/__init__.py
@@ -17,9 +17,7 @@ __email__ = "mike_faehrmann@web.de"
 import os
 import sys
 import argparse
-import configparser
-
-from .download import DownloadManager
+from . import config, download
 
 def parse_cmdline_options():
     parser = argparse.ArgumentParser(
@@ -41,18 +39,10 @@ def parse_cmdline_options():
     )
     return parser.parse_args()
 
-def parse_config_file(path):
-    config = configparser.ConfigParser(
-        interpolation=None,
-    )
-    config.optionxform = lambda opt: opt
-    config.read(os.path.expanduser(path))
-    return config
-
 def main():
+    config.load()
     opts = parse_cmdline_options()
-    conf = parse_config_file(opts.config)
-    dlmgr = DownloadManager(opts, conf)
+    dlmgr = download.DownloadManager(opts)
 
     try:
         for url in opts.urls:
gallery_dl/config.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Global configuration module"""
+
+import sys
+import json
+import os.path
+import platform
+
+# --------------------------------------------------------------------
+# public interface
+
+def load(*files):
+    """Load JSON configuration files"""
+    configfiles = files or _default_configs
+    for conf in configfiles:
+        try:
+            path = os.path.expanduser(conf)
+            with open(path) as file:
+                confdict = json.load(file)
+            _config.update(confdict)
+        except FileNotFoundError:
+            continue
+        except json.decoder.JSONDecodeError as exception:
+            print("Error while loading '", path, "':", sep="", file=sys.stderr)
+            print(exception, file=sys.stderr)
+
+def clear():
+    """Reset configuration to an empty state"""
+    globals()["_config"] = {}
+
+def get(keys, default=None):
+    """Get the value of property 'key' or a default value if it doesn't exist"""
+    conf = _config
+    try:
+        for k in keys:
+            conf = conf[k]
+        return conf
+    except (KeyError, AttributeError):
+        return default
+
+def interpolate(keys, default=None):
+    """Interpolate the value of 'key'"""
+    conf = _config
+    try:
+        for k in keys:
+            default = conf.get(keys[-1], default)
+            conf = conf[k]
+        return conf
+    except (KeyError, AttributeError):
+        return default
+
+def set(keys, value):
+    """Set the value of property 'key' for this session"""
+    conf = _config
+    for k in keys[:-1]:
+        try:
+            conf = conf[k]
+        except KeyError:
+            temp = {}
+            conf[k] = temp
+            conf = temp
+    conf[keys[-1]] = value
+
+
+# --------------------------------------------------------------------
+# internals
+
+_config = {}
+
+if platform.system() == "Windows":
+    _default_configs = [
+        r"~\.config\gallery-dl.conf",
+        r"~\.gallery-dl.conf",
+    ]
+else:
+    _default_configs = [
+        "/etc/gallery-dl.conf",
+        "~/.config/gallery/config.json",
+        "~/.config/gallery-dl.conf",
+        "~/.gallery-dl.conf",
+    ]
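A short usage sketch of this interface (the behavior shown here matches what test/test_config.py, added further down, asserts):

    from gallery_dl import config

    config.load()                        # merge the default JSON config files
    config.set(["e", "f", "g"], 234)     # set() creates intermediate dicts as needed
    config.get(["e", "f", "g"])          # -> 234
    config.get(["no", "such", "key"], default=5)  # -> 5 instead of a KeyError
    config.clear()                       # reset to an empty state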
gallery_dl/download.py
@@ -11,12 +11,12 @@ import sys
 import importlib
 from . import extractor
 from .extractor.common import Message
+from . import config
 
 class DownloadManager():
 
-    def __init__(self, opts, config):
+    def __init__(self, opts):
         self.opts = opts
-        self.config = config
         self.modules = {}
 
     def add(self, url):
@@ -36,7 +36,7 @@ class DownloadManager():
         if self.opts.dest:
             return self.opts.dest
         else:
-            return self.config.get("general", "destination", fallback="/tmp/")
+            return config.get(("base-directory",), default="/tmp/")
 
 
 class DownloadJob():
@@ -48,16 +48,14 @@ class DownloadJob():
             return
         self.directory = mngr.get_base_directory()
         self.downloaders = {}
-        self.filename_fmt = mngr.config.get(
-            self.info["category"], "filename",
-            fallback=self.info["filename"]
+        self.filename_fmt = config.get(
+            ("extractor", self.info["category"], "filename"),
+            default=self.info["filename"]
+        )
+        segments = config.get(
+            ("extractor", self.info["category"], "directory"),
+            default=self.info["directory"]
         )
-        try:
-            segments = mngr.config.get(
-                self.info["category"], "directory"
-            ).split("/")
-        except Exception:
-            segments = self.info["directory"]
         self.directory_fmt = os.path.join(*segments)
 
     def run(self):
@@ -112,13 +110,11 @@ class DownloadJob():
         scheme = url[:pos] if pos != -1 else "http"
         if scheme == "https":
             scheme = "http"
-
         downloader = self.downloaders.get(scheme)
         if downloader is None:
            module = self.mngr.get_downloader_module(scheme)
            downloader = module.Downloader()
            self.downloaders[scheme] = downloader
-
         return downloader
 
     @staticmethod
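DownloadJob now resolves its format strings through the global config module instead of a ConfigParser object handed around by DownloadManager. Restating the lookup above for a concrete category (nothing new, just the code above with "danbooru" filled in):

    filename_fmt = config.get(
        ("extractor", "danbooru", "filename"),
        default=info["filename"]       # extractor's built-in default format
    )
    segments = config.get(
        ("extractor", "danbooru", "directory"),
        default=info["directory"]      # already a list of segments, no .split("/")
    )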
gallery_dl/extractor/3dbooru.py
@@ -22,8 +22,8 @@ info = {
 
 class ThreeDeeBooruExtractor(JSONBooruExtractor):
 
-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
         self.api_url = "http://behoimi.org/post/index.json"
         self.headers = {
             "Referer": "http://behoimi.org/post/show/",
gallery_dl/extractor/4chan.py
@@ -8,65 +8,25 @@
 
 """Extract image- and video-urls from threads on https://www.4chan.org/"""
 
-from .common import SequentialExtractor, Message
-from urllib.parse import unquote
-import re
+from .chan import ChanExtractor
 
 info = {
     "category": "4chan",
     "extractor": "FourChanExtractor",
-    "directory": ["{category}", "{board}-{thread-id}"],
-    "filename": "{timestamp}-{name}",
+    "directory": ["{category}", "{board}-{thread}"],
+    "filename": "{tim}-{filename}{ext}",
     "pattern": [
         r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*",
     ],
 }
 
-class FourChanExtractor(SequentialExtractor):
+class FourChanExtractor(ChanExtractor):
 
-    url_fmt = "https://boards.4chan.org/{0}/res/{1}.html"
-    regex = (
-        r'<a (?:title="(?P<orig_name>[^"]+)" )?href="'
-        r'(?P<url>//i.4cdn.org/[^/]+/(?P<timestamp>\d+)\.(?P<extension>[^"]+))'
-        r'" target="_blank">(?P<name>[^<]+)</a> '
-        r'\((?P<size>[^,]+), (?P<width>\d+)x(?P<height>\d+)\)'
-    )
+    api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
+    file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
 
-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
-        self.match = match
-        self.metadata = None
-
-    def items(self):
-        yield Message.Version, 1
-
-        url = self.url_fmt.format(*self.match.groups())
-        text = self.request(url).text
-        self.metadata = self.get_job_metadata(text)
-
-        yield Message.Directory, self.metadata
-        for match in re.finditer(self.regex, text):
-            yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
-
-    def get_job_metadata(self, text):
-        """Collect metadata for extractor-job"""
-        board, thread_id = self.match.groups()
-        title, _ = self.extract(text, '"description" content="', ' - "/')
-        return {
-            "category": info["category"],
-            "board": board,
-            "thread-id": thread_id,
-            "title": unquote(title),
-        }
-
-    def get_file_metadata(self, match):
-        """Collect metadata for a downloadable file"""
-        data = self.metadata
-        data.update(match.groupdict(default=""))
-        data["name"] = unquote(data["orig_name"] or data["name"])
-        return data
-
-    @staticmethod
-    def get_file_url(match):
-        """Extract download-url from 'match'"""
-        return "https:" + match.group("url")
+    def __init__(self, match):
+        ChanExtractor.__init__(
+            self, info["category"],
+            match.group(1), match.group(2)
+        )
gallery_dl/extractor/8chan.py
@@ -8,65 +8,25 @@
 
 """Extract image- and video-urls from threads on https://8ch.net/"""
 
-from .common import SequentialExtractor, Message
-from urllib.parse import unquote
-import re
+from .chan import ChanExtractor
 
 info = {
     "category": "8chan",
     "extractor": "InfinityChanExtractor",
-    "directory": ["{category}", "{board}-{thread-id}"],
-    "filename": "{timestamp}-{name}",
+    "directory": ["{category}", "{board}-{thread}"],
+    "filename": "{tim}-{filename}{ext}",
     "pattern": [
-        r"(?:https?://)?(?:www\.)?(?:8chan\.co|8ch\.net)/([^/]+/res/\d+).*",
+        r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*",
     ],
 }
 
-class InfinityChanExtractor(SequentialExtractor):
+class InfinityChanExtractor(ChanExtractor):
 
-    url_base = "https://8ch.net"
-    url_fmt = url_base + "/{board}/res/{thread-id}.html"
-    regex = (
-        r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?'
-        r'<span class="postfilename"( title="([^"]+)")?>([^<]+)<'
-    )
+    api_url = "https://8ch.net/{board}/res/{thread}.json"
+    file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
 
-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
-        self.match = match
-
-    def items(self):
-        yield Message.Version, 1
-
-        metadata = self.get_job_metadata()
-        yield Message.Directory, metadata
-
-        url = self.url_fmt.format(**metadata)
-        text = self.request(url).text
-        for match in re.finditer(self.regex, text):
-            yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
-
-    def get_job_metadata(self):
-        """Collect metadata for extractor-job"""
-        board, _, thread_id = self.match.group(1).split("/")
-        return {
-            "category": info["category"],
-            "board": board,
-            "thread-id": thread_id,
-        }
-
-    @staticmethod
-    def get_file_metadata(match):
-        """Collect metadata for a downloadable file"""
-        return {
-            "timestamp": match.group(2),
-            "name": unquote(match.group(4) or match.group(5)),
-        }
-
-    def get_file_url(self, match):
-        """Extract download-url from 'match'"""
-        url = match.group(1)
-        if url.startswith("/"):
-            url = self.url_base + url
-        return url
-
+    def __init__(self, match):
+        ChanExtractor.__init__(
+            self, info["category"],
+            match.group(1), match.group(2)
+        )
gallery_dl/extractor/batoto.py
@@ -8,10 +8,8 @@
 
 """Extract manga pages from http://bato.to/"""
 
-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url, unescape
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
 import os.path
 import re
 
@@ -29,8 +27,8 @@ class BatotoExtractor(AsynchronousExtractor):
 
     url_base = "http://bato.to/read/_/"
 
-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
         self.chapter_id = match.group(1)
 
     def items(self):
@@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor):
     def get_page_metadata(self, page_url):
         """Collect next url and metadata for one manga-page"""
         page = self.request(page_url).text
-        _    , pos = self.extract(page, 'selected="selected"', '')
-        title, pos = self.extract(page, ': ', '<', pos)
-        _    , pos = self.extract(page, 'selected="selected"', '', pos)
-        trans, pos = self.extract(page, '>', '<', pos)
-        _    , pos = self.extract(page, '<div id="full_image"', '', pos)
-        image, pos = self.extract(page, '<img src="', '"', pos)
-        url  , pos = self.extract(page, '<a href="', '"', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '')
+        title, pos = text.extract(page, ': ', '<', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '', pos)
+        trans, pos = text.extract(page, '>', '<', pos)
+        _    , pos = text.extract(page, '<div id="full_image"', '', pos)
+        image, pos = text.extract(page, '<img src="', '"', pos)
+        url  , pos = text.extract(page, '<a href="', '"', pos)
         mmatch = re.search(
             r"<title>(.+) - (?:vol (\d+) )?"
             r"ch (\d+)[^ ]+ Page (\d+) | Batoto!</title>",
@@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor):
             r"(.+) - ([^ ]+)",
             trans
         )
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
         name, ext = os.path.splitext(filename)
         return url, {
             "category": info["category"],
             "chapter-id": self.chapter_id,
-            "manga": unescape(mmatch.group(1)),
+            "manga": text.unescape(mmatch.group(1)),
             "volume": mmatch.group(2) or "",
             "chapter": mmatch.group(3),
             "page": mmatch.group(4),
             "group": tmatch.group(1),
             "language": tmatch.group(2),
-            "title": unescape(title),
+            "title": text.unescape(title),
             "image-url": image,
             "name": name,
             "extension": ext[1:],
gallery_dl/extractor/booru.py
@@ -8,23 +8,21 @@
 
 """Base classes for extractors for danbooru and co"""
 
-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
 import xml.etree.ElementTree as ET
 import json
 import os.path
 import urllib.parse
 
-
 class BooruExtractor(SequentialExtractor):
 
     api_url = ""
 
-    def __init__(self, match, config, info):
-        SequentialExtractor.__init__(self, config)
+    def __init__(self, match, info):
+        SequentialExtractor.__init__(self)
         self.info = info
-        self.tags = urllib.parse.unquote(match.group(1))
+        self.tags = text.unquote(match.group(1))
         self.page = "page"
         self.params = {"tags": self.tags}
         self.headers = {}
@@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor):
     def get_file_metadata(self, data):
         """Collect metadata for a downloadable file"""
         data["category"] = self.info["category"]
-        data["name"] = urllib.parse.unquote(
-            filename_from_url(self.get_file_url(data))
+        data["name"] = text.unquote(
+            text.filename_from_url(self.get_file_url(data))
         )
         data["extension"] = os.path.splitext(data["name"])[1][1:]
         return data
gallery_dl/extractor/chan.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for different Futaba Channel boards"""
+
+from .common import SequentialExtractor, Message
+from .. import text
+
+class ChanExtractor(SequentialExtractor):
+
+    api_url = ""
+    file_url = ""
+
+    def __init__(self, category, board, thread):
+        SequentialExtractor.__init__(self)
+        self.metadata = {
+            "category": category,
+            "board": board,
+            "thread": thread,
+        }
+
+    def items(self):
+        yield Message.Version, 1
+        posts = self.request(self.api_url.format(**self.metadata)).json()["posts"]
+        self.metadata["title"] = self.get_thread_title(posts[0])
+        yield Message.Directory, self.metadata
+        for post in posts:
+            if "filename" not in post:
+                continue
+            post.update(self.metadata)
+            yield Message.Url, self.file_url.format(**post), post
+            if "extra_files" in post:
+                for file in post["extra_files"]:
+                    post.update(file)
+                    yield Message.Url, self.file_url.format(**post), post
+
+    @staticmethod
+    def get_thread_title(post):
+        """Return thread title from first post"""
+        if "sub" in post:
+            return post["sub"]
+        return text.remove_html(post["com"])[:50]
gallery_dl/extractor/common.py
@@ -12,7 +12,7 @@ import time
 import queue
 import requests
 import threading
-import html.parser
+from .. import config
 
 
 class Message():
@@ -44,36 +44,18 @@ class Extractor():
         "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"
     )
 
-    @staticmethod
-    def extract(txt, begin, end, pos=0):
-        try:
-            first = txt.index(begin, pos) + len(begin)
-            last = txt.index(end, first)
-            return txt[first:last], last+len(end)
-        except ValueError:
-            return None, pos
-
-    @staticmethod
-    def extract_all(txt, begin, end, pos=0):
-        try:
-            first = txt.index(begin, pos)
-            last = txt.index(end, first + len(begin)) + len(end)
-            return txt[first:last], last
-        except ValueError:
-            return None, pos
-
 
 class SequentialExtractor(Extractor):
 
-    def __init__(self, _):
+    def __init__(self):
         Extractor.__init__(self)
 
 
 class AsynchronousExtractor(Extractor):
 
-    def __init__(self, config):
+    def __init__(self):
         Extractor.__init__(self)
-        queue_size = int(config.get("general", "queue-size", fallback=5))
+        queue_size = int(config.get(("queue-size",), default=5))
         self.__queue = queue.Queue(maxsize=queue_size)
         self.__thread = threading.Thread(target=self.async_items, daemon=True)
 
@@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs):
 
     # everything ok -- proceed to download
     return r
-
-def filename_from_url(url):
-    pos = url.rfind("/")
-    return url[pos+1:]
-
-unescape = html.parser.HTMLParser().unescape
 
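The queue size of AsynchronousExtractor is now a plain top-level "queue-size" entry in the JSON configuration rather than an option in a [general] INI section; for example (a sketch, 5 remains the default when the key is absent):

    from gallery_dl import config
    config.set(["queue-size"], 10)  # equivalent to {"queue-size": 10} in the config file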
gallery_dl/extractor/danbooru.py
@@ -22,6 +22,6 @@ info = {
 
 class DanbooruExtractor(JSONBooruExtractor):
 
-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
         self.api_url = "https://danbooru.donmai.us/posts.json"
gallery_dl/extractor/e621.py
@@ -23,6 +23,6 @@ info = {
 
 class E621Extractor(JSONBooruExtractor):
 
-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
         self.api_url = "https://e621.net/post/index.json"
gallery_dl/extractor/gelbooru.py
@@ -22,8 +22,8 @@ info = {
 
 class GelbooruExtractor(XMLBooruExtractor):
 
-    def __init__(self, match, config):
-        XMLBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        XMLBooruExtractor.__init__(self, match, info)
         self.api_url = "http://gelbooru.com/"
         self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}
 
gallery_dl/extractor/imagebam.py
@@ -8,9 +8,8 @@
 
 """Extract images from galleries at http://www.imagebam.com/"""
 
-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from .. import text
 
 info = {
     "category": "imagebam",
@@ -26,8 +25,8 @@ class ImagebamExtractor(AsynchronousExtractor):
 
     url_base = "http://www.imagebam.com"
 
-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
         self.match = match
         self.num = 0
         self.metadata = {}
@@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor):
         done = False
         while not done:
             # get current page
-            text = self.request(self.url_base + next_url).text
+            page = self.request(self.url_base + next_url).text
 
             # get url for next page
-            next_url, pos = self.extract(text, "<a class='buttonblue' href='", "'")
+            next_url, pos = text.extract(page, "<a class='buttonblue' href='", "'")
 
             # if the following text isn't "><span>next image" we are done
-            if not text.startswith("><span>next image", pos):
+            if not page.startswith("><span>next image", pos):
                 done = True
 
             # get image url
-            img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos)
+            img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)
 
             yield Message.Url, img_url, self.get_file_metadata(img_url)
 
     def get_job_metadata(self):
         """Collect metadata for extractor-job"""
         gallery_key = self.match.group(2)
-        text = self.request(self.url_base + "/gallery/" + gallery_key).text
-        _    , pos = self.extract(text, "<img src='/img/icons/photos.png'", "")
-        title, pos = self.extract(text, "'> ", " <", pos)
-        count, pos = self.extract(text, "'>", " images", pos)
-        url  , pos = self.extract(text, "<a href='http://www.imagebam.com", "'", pos)
+        page = self.request(self.url_base + "/gallery/" + gallery_key).text
+        _    , pos = text.extract(page, "<img src='/img/icons/photos.png'", "")
+        title, pos = text.extract(page, "'> ", " <", pos)
+        count, pos = text.extract(page, "'>", " images", pos)
+        url  , pos = text.extract(page, "<a href='http://www.imagebam.com", "'", pos)
         return {
             "category": info["category"],
             "key": gallery_key,
@@ -77,5 +76,5 @@ class ImagebamExtractor(AsynchronousExtractor):
         self.num += 1
         data = self.metadata.copy()
         data["num"] = self.num
-        data["name"] = filename_from_url(url)
+        data["name"] = text.filename_from_url(url)
         return data
gallery_dl/extractor/imgbox.py
@@ -9,6 +9,7 @@
 """Extract images from galleries at http://imgbox.com/"""
 
 from .common import AsynchronousExtractor, Message
+from .. import text
 import re
 
 info = {
@@ -25,8 +26,8 @@ class ImgboxExtractor(AsynchronousExtractor):
 
     url_base = "http://imgbox.com"
 
-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
         self.key = match.group(1)
         self.metadata = {}
 
@@ -36,8 +37,8 @@ class ImgboxExtractor(AsynchronousExtractor):
         yield Message.Version, 1
         yield Message.Directory, self.metadata
         for match in re.finditer(r'<a href="([^"]+)"><img alt="', page):
-            text = self.request(self.url_base + match.group(1)).text
-            yield Message.Url, self.get_file_url(text), self.get_file_metadata(text)
+            imgpage = self.request(self.url_base + match.group(1)).text
+            yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)
 
     def get_job_metadata(self, page):
         """Collect metadata for extractor-job"""
@@ -51,16 +52,16 @@ class ImgboxExtractor(AsynchronousExtractor):
             "count": match.group(4),
         }
 
-    def get_file_metadata(self, text):
+    def get_file_metadata(self, page):
         """Collect metadata for a downloadable file"""
         data = self.metadata.copy()
-        data["num"]      , pos = self.extract(text, '</a> ', ' of ')
-        data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos)
-        data["name"]     , pos = self.extract(text, ' title="', '"', pos)
+        data["num"]      , pos = text.extract(page, '</a> ', ' of ')
+        data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos)
+        data["name"]     , pos = text.extract(page, ' title="', '"', pos)
         return data
 
-    def get_file_url(self, text):
+    def get_file_url(self, page):
         """Extract download-url"""
         base = "http://i.imgbox.com/"
-        path, _ = self.extract(text, base, '"')
+        path, _ = text.extract(page, base, '"')
         return base + path
gallery_dl/extractor/imgchili.py
@@ -8,9 +8,8 @@
 
 """Extract images from albums at http://imgchili.net/"""
 
-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
 import re
 
 info = {
@@ -25,8 +24,8 @@ info = {
 
 class ImgchiliExtractor(SequentialExtractor):
 
-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
+    def __init__(self, match):
+        SequentialExtractor.__init__(self)
         self.match = match
         self.num = 0
 
@@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor):
 
     def get_job_metadata(self, page):
         """Collect metadata for extractor-job"""
-        title = self.extract(page, "<h1>", "</h1>")[0]
+        title = text.extract(page, "<h1>", "</h1>")[0]
         return {
             "category": info["category"],
             "title": title,
gallery_dl/extractor/mangareader.py
@@ -8,10 +8,8 @@
 
 """Extract manga pages from http://www.mangareader.net/"""
 
-from .common import AsynchronousExtractor
-from .common import Message
-from .common import unescape, filename_from_url
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
 import os.path
 import re
 
@@ -30,8 +28,8 @@ class MangaReaderExtractor(AsynchronousExtractor):
 
     url_base = "http://www.mangareader.net"
 
-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
         self.part = match.group(1)
 
     def items(self):
@@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor):
     def get_page_metadata(self, page_url):
         """Collect next url, image-url and metadata for one manga-page"""
         page = self.request(page_url).text
-        extr = self.extract
+        extr = text.extract
         width = None
         descr, pos = extr(page, '<meta name="description" content="', '"')
         test , pos = extr(page, "document['pu']", '', pos)
@@ -62,13 +60,13 @@ class MangaReaderExtractor(AsynchronousExtractor):
         width , pos = extr(page, '<img id="img" width="', '"', pos)
         height, pos = extr(page, ' height="', '"', pos)
         image, pos = extr(page, ' src="', '"', pos)
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
         name, ext = os.path.splitext(filename)
         match = re.match(r"(.*) (\d+) - Read \1 \2 Manga Scans Page (\d+)", descr)
 
         return self.url_base + url, image, {
             "category": info["category"],
-            "manga": unescape(match.group(1)),
+            "manga": text.unescape(match.group(1)),
             "chapter": match.group(2),
             "page": match.group(3),
             "width": width,
gallery_dl/extractor/nijie.py
@@ -8,9 +8,8 @@
 
 """Extract images from https://nijie.info/"""
 
-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from .. import config, text
 import re
 
 info = {
@@ -27,8 +26,8 @@ class NijieExtractor(AsynchronousExtractor):
 
     popup_url = "https://nijie.info/view_popup.php?id="
 
-    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+    def __init__(self, match):
+        AsynchronousExtractor.__init__(self)
         self.artist_id = match.group(1)
         self.artist_url = (
             "https://nijie.info/members_illust.php?id="
@@ -37,7 +36,9 @@ class NijieExtractor(AsynchronousExtractor):
         self.session.headers["Referer"] = self.artist_url
         self.session.cookies["R18"] = "1"
         self.session.cookies["nijie_referer"] = "nijie.info"
-        self.session.cookies.update(config["nijie-cookies"])
+        self.session.cookies.update(
+            config.get(("extractor", info["category"], "cookies"))
+        )
 
     def items(self):
         data = self.get_job_metadata()
@@ -56,19 +57,20 @@ class NijieExtractor(AsynchronousExtractor):
         }
 
     def get_image_ids(self):
-        text = self.request(self.artist_url).text
+        """Collect all image-ids for a specific artist"""
+        page = self.request(self.artist_url).text
         regex = r'<a href="/view\.php\?id=(\d+)"'
-        return [m.group(1) for m in re.finditer(regex, text)]
+        return [m.group(1) for m in re.finditer(regex, page)]
 
     def get_image_data(self, image_id):
         """Get URL and metadata for images specified by 'image_id'"""
-        text = self.request(self.popup_url + image_id).text
-        matches = re.findall('<img src="([^"]+)"', text)
+        page = self.request(self.popup_url + image_id).text
+        matches = re.findall('<img src="([^"]+)"', page)
         for index, url in enumerate(matches):
             yield "https:" + url, {
                 "count": len(matches),
                 "index": index,
                 "image-id": image_id,
-                "name" : filename_from_url(url),
+                "name" : text.filename_from_url(url),
                 "extension": url[url.rfind(".")+1:],
             }
gallery_dl/extractor/pixiv.py
@@ -8,8 +8,8 @@
 
 """Extract images and ugoira from http://www.pixiv.net/"""
 
-from .common import SequentialExtractor
-from .common import Message
+from .common import SequentialExtractor, Message
+from .. import config, text
 import re
 import json
 
@@ -29,16 +29,15 @@ class PixivExtractor(SequentialExtractor):
     member_url = "http://www.pixiv.net/member_illust.php"
     illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"
 
-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
-        self.config = config
+    def __init__(self, match):
+        SequentialExtractor.__init__(self)
         self.artist_id = match.group(1)
         self.api = PixivAPI(self.session)
 
     def items(self):
         self.api.login(
-            self.config.get("pixiv", "username"),
-            self.config.get("pixiv", "password"),
+            config.get(("extractor", "pixiv", "username")),
+            config.get(("extractor", "pixiv", "password")),
         )
         metadata = self.get_job_metadata()
 
@@ -84,9 +83,9 @@ class PixivExtractor(SequentialExtractor):
 
     def get_works(self):
         """Yield all work-items for a pixiv-member"""
-        page = 1
+        pagenum = 1
         while True:
-            data = self.api.user_works(self.artist_id, page)
+            data = self.api.user_works(self.artist_id, pagenum)
             for work in data["response"]:
                 url = work["image_urls"]["large"]
                 work["num"] = ""
@@ -96,17 +95,17 @@ class PixivExtractor(SequentialExtractor):
             pinfo = data["pagination"]
             if pinfo["current"] == pinfo["pages"]:
                 return
-            page = pinfo["next"]
+            pagenum = pinfo["next"]
 
     def parse_ugoira(self, data):
         """Parse ugoira data"""
         # get illust page
-        text = self.request(
+        page = self.request(
             self.illust_url, params={"illust_id": data["id"]},
         ).text
 
         # parse page
-        frames, _ = self.extract(text, ',"frames":[', ']')
+        frames, _ = text.extract(page, ',"frames":[', ']')
 
         # build url
         url = re.sub(
@@ -146,7 +145,7 @@ class PixivAPI():
         self.session = session
         self.session.headers.update({
             "Referer": "http://www.pixiv.net/",
-            "User-Agent": "PixivIOSApp/5.1.1",
+            "User-Agent": "PixivIOSApp/5.8.0",
             # "Authorization": "Bearer 8mMXXWT9iuwdJvsVIvQsFYDwuZpRCMePeyagSh30ZdU",
         })
 
gallery_dl/extractor/redhawkscans.py
@@ -8,9 +8,8 @@
 
 """Extract manga pages from http://manga.redhawkscans.com/"""
 
-from .common import SequentialExtractor
-from .common import Message
-from .common import unescape
+from .common import SequentialExtractor, Message
+from .. import text
 import os.path
 import json
 import re
@@ -29,8 +28,8 @@ class RedHawkScansExtractor(SequentialExtractor):
 
     url_base = "https://manga.redhawkscans.com/reader/read/"
 
-    def __init__(self, match, config):
-        SequentialExtractor.__init__(self, config)
+    def __init__(self, match):
+        SequentialExtractor.__init__(self)
         self.part = match.group(1)
 
     def items(self):
@@ -50,16 +49,16 @@ class RedHawkScansExtractor(SequentialExtractor):
         response = self.request(self.url_base + self.part)
         response.encoding = "utf-8"
         page = response.text
-        _        , pos = self.extract(page, '<h1 class="tbtitle dnone">', '')
-        manga    , pos = self.extract(page, 'title="', '"', pos)
-        chapter  , pos = self.extract(page, '">', '</a>', pos)
-        json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos)
+        _        , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
+        manga    , pos = text.extract(page, 'title="', '"', pos)
+        chapter  , pos = text.extract(page, '">', '</a>', pos)
+        json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos)
         match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
         return {
             "category": info["category"],
-            "manga": unescape(manga),
+            "manga": text.unescape(manga),
             "chapter": match.group(2) or match.group(1),
             "chapter-minor": match.group(3) or "",
             "language": "English",
-            "title": unescape(match.group(4) or ""),
+            "title": text.unescape(match.group(4) or ""),
         }, json.loads(json_data)
gallery_dl/extractor/yandere.py
@@ -22,6 +22,6 @@ info = {
 
 class YandereExtractor(JSONBooruExtractor):
 
-    def __init__(self, match, config):
-        JSONBooruExtractor.__init__(self, match, config, info)
+    def __init__(self, match):
+        JSONBooruExtractor.__init__(self, match, info)
         self.api_url = "https://yande.re/post.json"
gallery_dl/text.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of functions that work on strings/text"""
+
+import re
+import html.parser
+import urllib.parse
+import platform
+
+def remove_html(text):
+    """Remove html-tags from a string"""
+    return " ".join(re.sub("<[^>]+?>", " ", text).split())
+
+def filename_from_url(url):
+    """Extract the last part of an url to use as a filename"""
+    try:
+        path = urllib.parse.urlparse(url).path
+        pos = path.rindex("/")
+        return path[pos+1:]
+    except ValueError:
+        return url
+
+def clean_path_windows(path):
+    """Remove illegal characters from a path-segment (Windows)"""
+    return re.sub(r'[<>:"\\/|?*]', "_", path)
+
+def clean_path_posix(path):
+    """Remove illegal characters from a path-segment (Posix)"""
+    return path.replace("/", "_")
+
+def extract(txt, begin, end, pos=0):
+    try:
+        first = txt.index(begin, pos) + len(begin)
+        last = txt.index(end, first)
+        return txt[first:last], last+len(end)
+    except ValueError:
+        return None, pos
+
+def extract_all(txt, begin, end, pos=0):
+    try:
+        first = txt.index(begin, pos)
+        last = txt.index(end, first + len(begin)) + len(end)
+        return txt[first:last], last
+    except ValueError:
+        return None, pos
+
+if platform.system() == "Windows":
+    clean_path = clean_path_windows
+else:
+    clean_path = clean_path_posix
+
+unquote = urllib.parse.unquote
+
+unescape = html.parser.HTMLParser().unescape
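text.extract() is the scraping primitive the extractors above switched to: it returns the substring between two markers together with the position after the match, so successive calls can chain through a page via pos. A small sketch on made-up input:

    from gallery_dl import text

    page = '<h1>Title</h1><img src="http://example.org/dir/a.png">'  # invented sample
    title, pos = text.extract(page, "<h1>", "</h1>")         # -> ("Title", 14)
    image, pos = text.extract(page, '<img src="', '"', pos)  # -> the image url
    text.filename_from_url(image)                            # -> "a.png"
    text.clean_path("a/b")                                   # -> "a_b" on POSIX and Windows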
setup.py (1 change)
@@ -46,4 +46,5 @@ setup(
         "Topic :: Multimedia",
         "Topic :: Multimedia :: Graphics",
     ],
+    test_suite='test',
 )
test/__init__.py (new, empty file)
test/test_config.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.config as config
+import os
+import tempfile
+
+class TestConfig(unittest.TestCase):
+
+    def setUp(self):
+        fd, self._configfile = tempfile.mkstemp()
+        with os.fdopen(fd, "w") as file:
+            file.write('{"a": "1", "b": {"c": "text"}}')
+        config.load(self._configfile)
+
+    def tearDown(self):
+        config.clear()
+        os.remove(self._configfile)
+
+    def test_get(self):
+        self.assertEqual(config.get(["a"]), "1")
+        self.assertEqual(config.get(["b", "c"]), "text")
+        self.assertEqual(config.get(["d"]), None)
+        self.assertEqual(config.get(["e", "f", "g"], 123), 123)
+
+    def test_set(self):
+        config.set(["b", "c"], [1, 2, 3])
+        config.set(["e", "f", "g"], value=234)
+        self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
+        self.assertEqual(config.get(["e", "f", "g"]), 234)
+
+    def test_interpolate(self):
+        self.assertEqual(config.interpolate(["a"]), "1")
+        self.assertEqual(config.interpolate(["b", "a"]), "1")
+        self.assertEqual(config.interpolate(["b", "c"], "2"), "text")
+        self.assertEqual(config.interpolate(["b", "d"], "2"), "2")
+        config.set(["d"], 123)
+        self.assertEqual(config.interpolate(["b", "d"], "2"), 123)
+        self.assertEqual(config.interpolate(["d", "d"], "2"), 123)
+
+if __name__ == '__main__':
+    unittest.main()
test/test_text.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.text as text
+
+class TestText(unittest.TestCase):
+
+    def test_remove_html(self):
+        cases = (
+            "Hello World.",
+            " Hello World. ",
+            "Hello<br/>World.",
+            "<div><span class='a'>Hello</span><strong>World.</strong></div>"
+        )
+        result = "Hello World."
+        for case in cases:
+            self.assertEqual(text.remove_html(case), result)
+
+    def test_filename_from_url(self):
+        cases = (
+            "http://example.org/v2/filename.ext",
+            "http://example.org/v2/filename.ext?param=value#fragment",
+            "example.org/filename.ext",
+            "/filename.ext",
+            "filename.ext",
+        )
+        result = "filename.ext"
+        for case in cases:
+            self.assertEqual(text.filename_from_url(case), result)
+
+    def test_clean_path(self):
+        cases = {
+            "Hello World." : ("Hello World.", "Hello World."),
+            "Hello/World/.": ("Hello_World_.", "Hello_World_."),
+            r'<Hello>:|"World\*?': (
+                '_Hello____World___', r'<Hello>:|"World\*?'
+            ),
+        }
+        for case, result in cases.items():
+            self.assertEqual(text.clean_path_windows(case), result[0])
+            self.assertEqual(text.clean_path_posix (case), result[1])
+
+if __name__ == '__main__':
+    unittest.main()