2015-04-10 21:45:41 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2017-03-16 04:17:35 +01:00
|
|
|
# Copyright 2014-2017 Mike Fährmann
|
2015-04-10 21:45:41 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
"""Downloader module for http:// and https:// urls"""
|
2015-04-10 21:45:41 +02:00
|
|
|
|
2014-10-12 21:56:44 +02:00
|
|
|
import time
|
|
|
|
import requests
|
2017-03-16 04:17:35 +01:00
|
|
|
import requests.exceptions as rexcepts
|
2016-09-30 12:32:48 +02:00
|
|
|
import mimetypes
|
2017-04-26 12:31:42 +02:00
|
|
|
import logging
|
2015-12-01 21:22:58 +01:00
|
|
|
from .common import BasicDownloader
|
2017-03-16 04:17:35 +01:00
|
|
|
from .. import config
|
2014-10-12 21:56:44 +02:00
|
|
|
|
2017-04-26 12:31:42 +02:00
|
|
|
# Module-level logger, registered under the fixed name "http"; used below to
# warn when no file extension can be derived from a response's MIME type.
log = logging.getLogger("http")
|
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2014-10-12 21:56:44 +02:00
|
|
|
class Downloader(BasicDownloader):
    """Downloader for http:// and https:// urls

    All requests go through a single 'requests' session, so headers and
    cookies installed via set_headers() / set_cookies() apply to every
    subsequent download.
    """

    # Upper bound on download attempts per file. Obtained from
    # config.interpolate() with 5 as second argument
    # (presumably the fallback when the option is unset -- confirm).
    retries = config.interpolate(("downloader", "http", "retries",), 5)
    # Value forwarded as 'timeout' to session.get(); None as second argument
    # (presumably the fallback when the option is unset -- confirm).
    timeout = config.interpolate(("downloader", "http", "timeout",), None)

    def __init__(self, output):
        """Create a new session and remember the output object

        'output' must provide error(), skip(), start() and success(),
        which are called by download_impl() to report progress.
        """
        BasicDownloader.__init__(self)
        self.session = requests.session()
        self.out = output

    def download_impl(self, url, pathfmt):
        """Download the resource at 'url' to the file described by 'pathfmt'

        Connection errors, timeouts, non-200 responses other than 404 and
        errors during the body transfer are retried (with a one second
        pause) until 'self.retries' attempts have failed. Other request
        errors, UnicodeError and HTTP 404 abort immediately.

        Nothing is returned; the outcome is reported through 'self.out'.
        """
        tries = 0
        msg = ""
        while True:
            tries += 1
            if tries > 1:
                # report why the previous attempt failed
                self.out.error(pathfmt.path, msg, tries-1, self.retries)
                if tries > self.retries:
                    return
                time.sleep(1)

            # try to connect to remote source
            try:
                response = self.session.get(
                    url, stream=True, timeout=self.timeout
                )
            except (rexcepts.ConnectionError, rexcepts.Timeout) as exception:
                # possibly transient -- retry
                msg = exception
                continue
            except (rexcepts.RequestException, UnicodeError) as exception:
                # not expected to succeed on retry -- give up
                msg = exception
                break

            # reject error-status-codes
            if response.status_code != 200:
                msg = 'HTTP status "{} {}"'.format(
                    response.status_code, response.reason
                )
                response.close()
                if response.status_code == 404:
                    break
                continue

            if not pathfmt.has_extension:
                # set 'extension' keyword from Content-Type header
                mtype = response.headers.get("Content-Type", "image/jpeg")
                # drop media type parameters ("image/png; charset=binary"),
                # which would otherwise prevent any match in 'mimetypes'
                mtype = mtype.partition(";")[0].strip()
                exts = mimetypes.guess_all_extensions(mtype, strict=False)
                if exts:
                    # several extensions may map to one type; use the
                    # alphabetically last one, without its leading dot
                    exts.sort()
                    pathfmt.set_extension(exts[-1][1:])
                else:
                    log.warning("No file extension found for MIME type '%s'",
                                mtype)
                    pathfmt.set_extension("txt")
                # the final path is only known now, so the check for an
                # already existing file has to happen here
                if pathfmt.exists():
                    self.out.skip(pathfmt.path)
                    response.close()
                    return

            # everything ok -- proceed to download
            self.out.start(pathfmt.path)
            # stays True if the transfer does not complete
            # NOTE(review): presumably evaluated by BasicDownloader -- confirm
            self.downloading = True
            try:
                with pathfmt.open() as file:
                    for data in response.iter_content(16384):
                        file.write(data)
            except rexcepts.RequestException as exception:
                msg = exception
                continue
            finally:
                # release the connection on every exit path, including
                # exceptions that are not handled here (e.g. OSError)
                response.close()
            self.downloading = False
            self.out.success(pathfmt.path, tries)
            return

        # output for unrecoverable errors
        self.out.error(pathfmt.path, msg, tries, 0)

    def set_headers(self, headers):
        """Set headers for http requests

        Replaces all headers currently stored in the session.
        """
        self.set_dict(self.session.headers, headers)

    def set_cookies(self, cookies):
        """Set cookies for http requests

        Replaces all cookies currently stored in the session.
        """
        self.set_dict(self.session.cookies, cookies)

    @staticmethod
    def set_dict(dest, src):
        """Copy the contents of dictionary 'src' to 'dest'

        'dest' is cleared first, so entries not present in 'src' are lost.
        """
        dest.clear()
        dest.update(src)
|