From 29692c5784a61ef6cabaca8396e56c3851a6c17a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 30 Sep 2016 12:32:48 +0200 Subject: [PATCH] get extension from Content-Type header if not provided --- gallery_dl/downloader/common.py | 11 ++--- gallery_dl/downloader/http.py | 35 +++++++++----- gallery_dl/downloader/https.py | 1 - gallery_dl/downloader/text.py | 21 ++++++--- gallery_dl/job.py | 68 +++++++++------------------ gallery_dl/path.py | 82 +++++++++++++++++++++++++++++++++ 6 files changed, 146 insertions(+), 72 deletions(-) delete mode 100644 gallery_dl/downloader/https.py create mode 100644 gallery_dl/path.py diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index 8eed513a..6c3012a3 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014, 2015 Mike Fährmann +# Copyright 2014-2016 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,16 +15,15 @@ class BasicDownloader(): max_tries = 5 - def download(self, url, fileobj): + def download(self, url, pathfmt): """Download the resource at 'url' and write it to a file-like object""" try: - return self.download_impl(url, fileobj) + return self.download_impl(url, pathfmt) except: # remove file if download failed try: - fileobj.close() - os.unlink(fileobj.name) - except AttributeError: + os.unlink(pathfmt.realpath) + except (AttributeError, FileNotFoundError): pass raise diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index ab7e4b4b..f6936028 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014, 2015 Mike Fährmann +# Copyright 2014-2016 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public 
License version 2 as @@ -10,16 +10,17 @@ import time import requests +import mimetypes from .common import BasicDownloader class Downloader(BasicDownloader): - def __init__(self, printer): + def __init__(self, output): BasicDownloader.__init__(self) self.session = requests.session() - self.printer = printer + self.out = output - def download_impl(self, url, file): + def download_impl(self, url, pathfmt): tries = 0 while True: # try to connect to remote source @@ -27,7 +28,7 @@ class Downloader(BasicDownloader): response = self.session.get(url, stream=True, verify=True) except requests.exceptions.ConnectionError as exptn: tries += 1 - self.printer.error(file, exptn, tries, self.max_tries) + self.out.error(pathfmt.path, exptn, tries, self.max_tries) time.sleep(1) if tries == self.max_tries: raise @@ -36,10 +37,8 @@ class Downloader(BasicDownloader): # reject error-status-codes if response.status_code != requests.codes.ok: tries += 1 - self.printer.error(file, 'HTTP status "{} {}"'.format( + self.out.error(pathfmt.path, 'HTTP status "{} {}"'.format( response.status_code, response.reason), tries, self.max_tries) - if response.status_code == 404: - return self.max_tries time.sleep(1) if tries == self.max_tries: response.raise_for_status() @@ -48,9 +47,22 @@ class Downloader(BasicDownloader): # everything ok -- proceed to download break - for data in response.iter_content(16384): - file.write(data) - return tries + if not pathfmt.has_extension: + # set 'extension' keyword from Content-Type header + mtype = response.headers.get("Content-Type", "image/jpeg") + extensions = mimetypes.guess_all_extensions(mtype) + extensions.sort() + pathfmt.set_extension(extensions[-1][1:]) + if pathfmt.exists(): + self.out.skip(pathfmt.path) + response.close() + return + + self.out.start(pathfmt.path) + with pathfmt.open() as file: + for data in response.iter_content(16384): + file.write(data) + self.out.success(pathfmt.path, tries) def set_headers(self, headers): """Set headers for http 
requests""" @@ -65,4 +77,3 @@ class Downloader(BasicDownloader): """Copy the contents of dictionary 'src' to 'dest'""" dest.clear() dest.update(src) - diff --git a/gallery_dl/downloader/https.py b/gallery_dl/downloader/https.py deleted file mode 100644 index a60160e0..00000000 --- a/gallery_dl/downloader/https.py +++ /dev/null @@ -1 +0,0 @@ -from .http import Downloader diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py index e47f3882..fdfd685f 100644 --- a/gallery_dl/downloader/text.py +++ b/gallery_dl/downloader/text.py @@ -1,20 +1,29 @@ # -*- coding: utf-8 -*- -# Copyright 2014, 2015 Mike Fährmann +# Copyright 2014-2016 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Downloader module for text urls""" +"""Downloader module for text:// urls""" from .common import BasicDownloader class Downloader(BasicDownloader): - def __init__(self, *args): + def __init__(self, output): BasicDownloader.__init__(self) + self.out = output - def download_impl(self, url, file): - file.write(bytes(url[7:], "utf-8")) - return 0 + def download_impl(self, url, pathfmt): + if not pathfmt.has_extension: + pathfmt.set_extension("txt") + if pathfmt.exists(): + self.out.skip(pathfmt.path) + return + + self.out.start(pathfmt.path) + with pathfmt.open() as file: + file.write(bytes(url[7:], "utf-8")) + self.out.success(pathfmt.path, 0) diff --git a/gallery_dl/job.py b/gallery_dl/job.py index ef30d6eb..0730d601 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -1,16 +1,14 @@ # -*- coding: utf-8 -*- -# Copyright 2015 Mike Fährmann +# Copyright 2015, 2016 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -import os import json import hashlib -import platform -from . 
import config, extractor, downloader, text, output, exception +from . import extractor, downloader, path, output, exception from .extractor.message import Message class Job(): @@ -73,19 +71,10 @@ class DownloadJob(Job): def __init__(self, url): Job.__init__(self, url) - self.directory = self.get_base_directory() + self.pathfmt = path.PathFormat(self.extractor) self.downloaders = {} self.queue = None - self.printer = output.select() - key = ["extractor", self.extractor.category] - if self.extractor.subcategory: - key.append(self.extractor.subcategory) - self.filename_fmt = config.interpolate( - key + ["filename_fmt"], default=self.extractor.filename_fmt - ) - self.directory_fmt = config.interpolate( - key + ["directory_fmt"], default=self.extractor.directory_fmt - ) + self.out = output.select() def run(self): Job.run(self) @@ -98,29 +87,16 @@ class DownloadJob(Job): def handle_url(self, url, keywords): """Download the resource specified in 'url'""" - filename = text.clean_path(self.filename_fmt.format(**keywords)) - path = os.path.join(self.directory, filename) - realpath = self.adjust_path(path) - if os.path.exists(realpath): - self.printer.skip(path) + self.pathfmt.set_keywords(keywords) + if self.pathfmt.exists(): + self.out.skip(self.pathfmt.path) return dlinstance = self.get_downloader(url) - self.printer.start(path) - with open(realpath, "wb") as file: - tries = dlinstance.download(url, file) - self.printer.success(path, tries) + dlinstance.download(url, self.pathfmt) def handle_directory(self, keywords): """Set and create the target directory for downloads""" - segments = [ - text.clean_path(segment.format(**keywords).strip()) - for segment in self.directory_fmt - ] - self.directory = os.path.join( - self.get_base_directory(), - *segments - ) - os.makedirs(self.adjust_path(self.directory), exist_ok=True) + self.pathfmt.set_directory(keywords) def handle_queue(self, url): """Add url to work-queue""" @@ -144,23 +120,10 @@ class DownloadJob(Job): instance = 
self.downloaders.get(scheme) if instance is None: klass = downloader.find(scheme) - instance = klass(self.printer) + instance = klass(self.out) self.downloaders[scheme] = instance return instance - @staticmethod - def get_base_directory(): - """Return the base-destination-directory for downloads""" - bdir = config.get(("base-directory",), default=(".", "gallery-dl")) - if not isinstance(bdir, str): - bdir = os.path.join(*bdir) - return os.path.expanduser(os.path.expandvars(bdir)) - - @staticmethod - def adjust_path(path, longpaths=platform.system() == "Windows"): - """Enable longer-than-260-character paths on windows""" - return "\\\\?\\" + os.path.abspath(path) if longpaths else path - class KeywordJob(Job): """Print available keywords""" @@ -207,6 +170,17 @@ class HashJob(DownloadJob): def __init__(self, hashobj): self.hashobj = hashobj + self.path = "" + self.has_extension = True + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + def open(self): + return self def write(self, content): """Update SHA1 hash""" diff --git a/gallery_dl/path.py b/gallery_dl/path.py new file mode 100644 index 00000000..ba09b534 --- /dev/null +++ b/gallery_dl/path.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import os +from . 
import config, text + +class PathFormat(): + + def __init__(self, extractor): + key = ["extractor", extractor.category] + if extractor.subcategory: + key.append(extractor.subcategory) + self.filename_fmt = config.interpolate( + key + ["filename_fmt"], default=extractor.filename_fmt + ) + self.directory_fmt = config.interpolate( + key + ["directory_fmt"], default=extractor.directory_fmt + ) + self.has_extension = False + self.keywords = {} + self.directory = self.realdirectory = "" + self.path = self.realpath = "" + + def open(self): + """Open file at 'realpath' and return a corresponding file object""" + return open(self.realpath, "wb") + + def exists(self): + """Return True if 'path' is complete and refers to an existing path""" + if self.has_extension: + return os.path.exists(self.realpath) + return False + + def set_directory(self, keywords): + """Build directory path and create it if necessary""" + segments = [ + text.clean_path(segment.format(**keywords).strip()) + for segment in self.directory_fmt + ] + self.directory = os.path.join( + self.get_base_directory(), + *segments + ) + self.realdirectory = self.adjust_path(self.directory) + os.makedirs(self.realdirectory, exist_ok=True) + + def set_keywords(self, keywords): + """Set filename keywords""" + self.keywords = keywords + self.has_extension = bool(keywords.get("extension")) + if self.has_extension: + self.build_path() + + def set_extension(self, extension): + """Set the 'extension' keyword""" + self.has_extension = True + self.keywords["extension"] = extension + self.build_path() + + def build_path(self, sep=os.path.sep): + """Use filename-keywords and directory to build a full path""" + filename = text.clean_path(self.filename_fmt.format(**self.keywords)) + self.path = self.directory + sep + filename + self.realpath = self.realdirectory + sep + filename + + @staticmethod + def get_base_directory(): + """Return the base-destination-directory for downloads""" + bdir = config.get(("base-directory",),
default=(".", "gallery-dl")) + if not isinstance(bdir, str): + bdir = os.path.join(*bdir) + return os.path.expanduser(os.path.expandvars(bdir)) + + @staticmethod + def adjust_path(path): + """Enable longer-than-260-character paths on windows""" + return "\\\\?\\" + os.path.abspath(path) if os.name == "nt" else path