gallery-dl/gallery_dl/job.py

# -*- coding: utf-8 -*-

# Copyright 2015-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

import sys
import json
import hashlib
from . import extractor, downloader, config, util, output, exception
from .extractor.message import Message


class Job():
    """Base class for Job-types

    A job looks up the extractor responsible for a URL, iterates over the
    messages this extractor yields and hands each of them to the matching
    handle_*() hook. The hooks do nothing in this class; subclasses
    override the ones they need.
    """
    # Target for URLs without a matching extractor (see _write_unsupported);
    # nothing gets written while this is None.
    ufile = None

    def __init__(self, url):
        """Find the extractor for 'url' and set up the message predicates

        Raises exception.NoExtractorError if extractor.find() returns None.
        """
        self.url = url
        self.extractor = extractor.find(url)
        if self.extractor is None:
            raise exception.NoExtractorError(url)
        self.extractor.log.debug(
            "Using %s for %s", self.extractor.__class__.__name__, url)

        # predicate checked before every Message.Url;
        # a plain True lets every message through
        items = config.get(("images",))
        if items:
            pred = util.RangePredicate(items)
            if pred.lower > 1:
                # let the extractor skip ahead and advance the predicate's
                # index by whatever skip() returns
                # (presumably the number of skipped items - TODO confirm)
                pred.index += self.extractor.skip(pred.lower - 1)
            self.pred_url = pred
        else:
            self.pred_url = True

        # predicate checked before every Message.Queue
        items = config.get(("chapters",))
        self.pred_queue = util.RangePredicate(items) if items else True

    def run(self):
        """Execute or run the job

        Dispatches every message of the extractor. The exceptions listed
        below are turned into log messages instead of being propagated;
        StopExtraction ends the job silently.
        """
        # bound outside of the 'try' so that all handlers can rely on it
        log = self.extractor.log
        try:
            for msg in self.extractor:
                self.dispatch(msg)
        except exception.AuthenticationError:
            log.error("Authentication failed. Please provide a valid "
                      "username/password pair.")
        except exception.AuthorizationError:
            log.error("You do not have permission to access the resource "
                      "at '%s'", self.url)
        except exception.NotFoundError as exc:
            res = str(exc) or "resource (gallery/image/user)"
            log.error("The %s at '%s' does not exist", res, self.url)
        except exception.HttpError as exc:
            log.error("HTTP request failed:\n%s", exc)
        except exception.StopExtraction:
            pass
        except OSError as exc:
            log.error("Unable to download data: %s", exc)
        except Exception as exc:
            log.error(("An unexpected error occurred: %s - %s. "
                       "Please run gallery-dl again with the --verbose flag, "
                       "copy its output and report this issue on "
                       "https://github.com/mikf/gallery-dl/issues ."),
                      exc.__class__.__name__, exc)
            log.debug("Traceback", exc_info=True)

    def dispatch(self, msg):
        """Call the appropriate message handler

        'msg' is an indexable message whose first element is its type:
          Message.Url:       msg[1] is the URL, msg[2] the keyword dict
          Message.Directory: msg[1] is the keyword dict
          Message.Queue:     msg[1] is the URL
          Message.Version:   msg[1] is the version number

        Raises RuntimeError for any message-version other than 1.
        Messages of any other type are ignored.
        """
        if msg[0] == Message.Url:
            if self.pred_url:
                self.update_kwdict(msg[2])
                self.handle_url(msg[1], msg[2])

        elif msg[0] == Message.Directory:
            self.update_kwdict(msg[1])
            self.handle_directory(msg[1])

        elif msg[0] == Message.Queue:
            if self.pred_queue:
                self.handle_queue(msg[1])

        elif msg[0] == Message.Version:
            if msg[1] != 1:
                # a plain string cannot be raised in Python 3 (doing so
                # results in a TypeError that hides the actual message)
                raise RuntimeError(
                    "unsupported message-version ({}, {})".format(
                        self.extractor.category, msg[1]
                    )
                )
            # TODO: support for multiple message versions

    def handle_url(self, url, keywords):
        """Handle Message.Url"""

    def handle_directory(self, keywords):
        """Handle Message.Directory"""

    def handle_queue(self, url):
        """Handle Message.Queue"""

    def update_kwdict(self, kwdict):
        """Add 'category' and 'subcategory' keywords"""
        # modifies the given dict in place
        kwdict["category"] = self.extractor.category
        kwdict["subcategory"] = self.extractor.subcategory

    def _write_unsupported(self, url):
        """Write 'url' as a single line to 'ufile', if one is set"""
        if self.ufile:
            print(url, file=self.ufile, flush=True)


class DownloadJob(Job):
    """Download images into appropriate directory/filename locations"""

    def __init__(self, url):
        Job.__init__(self, url)
        self.pathfmt = util.PathFormat(self.extractor)
        # downloader instances created so far, keyed by URL scheme
        self.downloaders = {}
        self.out = output.select()

    def handle_url(self, url, keywords):
        """Download 'url', or report a skip if its path already exists"""
        target = self.pathfmt
        target.set_keywords(keywords)
        if not target.exists():
            self.get_downloader(url).download(url, target)
        else:
            self.out.skip(target.path)

    def handle_directory(self, keywords):
        """Pass the directory keywords on to the path formatter"""
        self.pathfmt.set_directory(keywords)

    def handle_queue(self, url):
        """Run a separate DownloadJob for a queued URL

        URLs without a matching extractor go to _write_unsupported().
        """
        try:
            DownloadJob(url).run()
        except exception.NoExtractorError:
            self._write_unsupported(url)

    def get_downloader(self, url):
        """Return the downloader for the scheme of 'url', creating it once"""
        # everything in front of the first colon is the scheme;
        # URLs without a colon and 'https' URLs are both treated as 'http'
        scheme, colon, _ = url.partition(":")
        if not colon or scheme == "https":
            scheme = "http"

        cached = self.downloaders.get(scheme)
        if cached is not None:
            return cached

        created = downloader.find(scheme)(self.extractor.session, self.out)
        self.downloaders[scheme] = created
        return created


class KeywordJob(Job):
    """Print available keywords"""

    def handle_url(self, url, keywords):
        """Print the filename keywords and stop the extraction"""
        print("\nKeywords for filenames:")
        print("-----------------------")
        self.print_keywords(keywords)
        raise exception.StopExtraction()

    def handle_directory(self, keywords):
        """Print the directory keywords"""
        print("Keywords for directory names:")
        print("-----------------------------")
        self.print_keywords(keywords)

    def handle_queue(self, url):
        """Print a hint to query 'url' directly and stop the extraction"""
        print("This extractor transfers work to other extractors and does not "
              "provide any keywords on its own. Try "
              "'gallery-dl --list-keywords \"", url, "\"' instead.", sep="")
        raise exception.StopExtraction()

    @staticmethod
    def print_keywords(keywords, prefix=""):
        """Print all entries of 'keywords', ordered by key

        Nested dicts are printed recursively as 'outer[inner]'. A list
        whose first element is a dict is represented by that first element
        as 'outer[][inner]'. Any other list has its values enumerated.
        """
        # a non-empty prefix ends with an opening bracket that needs closing
        closing = "]" if prefix else ""

        for name in sorted(keywords):
            value = keywords[name]
            label = prefix + name + closing

            if isinstance(value, dict):
                KeywordJob.print_keywords(value, label + "[")
                continue

            if not isinstance(value, list):
                # string or number
                print(label, "\n  ", value, sep="")
                continue

            if value and isinstance(value[0], dict):
                KeywordJob.print_keywords(value[0], label + "[][")
                continue

            print(label, "[]", sep="")
            for entry in value:
                print("  -", entry)


class UrlJob(Job):
    """Print download urls"""
    # depth at which queued URLs are printed instead of being followed
    maxdepth = -1

    def __init__(self, url, depth=1):
        Job.__init__(self, url)
        self.depth = depth
        if self.maxdepth == depth:
            # final level: replace the recursive handler with a plain print
            self.handle_queue = print

    @staticmethod
    def handle_url(url, _):
        """Print 'url'; the keyword argument is ignored"""
        print(url)

    def handle_queue(self, url):
        """Run a nested UrlJob, one level deeper, for a queued URL

        URLs without a matching extractor go to _write_unsupported().
        """
        next_depth = self.depth + 1
        try:
            UrlJob(url, next_depth).run()
        except exception.NoExtractorError:
            self._write_unsupported(url)


class TestJob(DownloadJob):
    """Generate test-results for extractor runs"""

    class HashIO():
        """Minimal file-like interface"""

        def __init__(self, hashobj):
            self.hashobj = hashobj
            self.path = ""
            self.has_extension = True

        def __enter__(self):
            return self

        def __exit__(self, *args):
            pass

        def open(self):
            """Return this object itself as the 'opened file'"""
            return self

        def write(self, content):
            """Feed 'content' into the wrapped hash object"""
            self.hashobj.update(content)

    def __init__(self, url, content=False):
        DownloadJob.__init__(self, url)
        self.content = content
        # one SHA1 object each for URLs, keywords and downloaded content
        self.hash_url = hashlib.sha1()
        self.hash_keyword = hashlib.sha1()
        self.hash_content = hashlib.sha1()
        if content:
            self.fileobj = self.HashIO(self.hash_content)

    def run(self):
        """Dispatch all messages; exceptions are not caught here"""
        for message in self.extractor:
            self.dispatch(message)

    def handle_url(self, url, keywords):
        """Hash the URL, its keywords and, if enabled, its content"""
        self.update_url(url)
        self.update_keyword(keywords)
        self.update_content(url)

    def handle_directory(self, keywords):
        """Hash the directory keywords"""
        self.update_keyword(keywords)

    def handle_queue(self, url):
        """Hash the queued URL"""
        self.update_url(url)

    def update_url(self, url):
        """Update the URL hash"""
        encoded = url.encode()
        self.hash_url.update(encoded)

    def update_keyword(self, kwdict):
        """Update the keyword hash"""
        # keys are sorted to get the same serialization on every run
        serialized = json.dumps(kwdict, sort_keys=True)
        self.hash_keyword.update(serialized.encode())

    def update_content(self, url):
        """Update the content hash"""
        if not self.content:
            return
        self.get_downloader(url).download(url, self.fileobj)


class DataJob(Job):
    """Collect extractor results and dump them"""

    def __init__(self, url, file=None):
        """Set up a job that dumps its results as JSON to 'file'

        'file' is any object that works with json.dump() and provides
        write(); the current sys.stdout is used when it is None.
        """
        Job.__init__(self, url)
        # resolved here instead of in the signature: a default argument is
        # evaluated only once, which would ignore a sys.stdout that got
        # replaced after this module was imported
        self.file = sys.stdout if file is None else file
        self.data = []
        self.ensure_ascii = config.get(("output", "ascii"), True)

    def run(self):
        """Collect all extractor messages and write them as JSON

        An exception raised during extraction is not propagated; its
        class name and message become the last entry of the output.
        """
        # collect data
        try:
            for msg in self.extractor:
                # store copies of all parts that offer a copy() method
                # (e.g. dicts), since the extractor might reuse them
                snapshot = [
                    part.copy() if hasattr(part, "copy") else part
                    for part in msg
                ]
                self.data.append(snapshot)
        except Exception as exc:
            self.data.append((exc.__class__.__name__, str(exc)))

        # dump to 'file'
        json.dump(
            self.data, self.file,
            sort_keys=True, indent=2, ensure_ascii=self.ensure_ascii
        )
        self.file.write("\n")
move DownloadManager and ExtractorFinder 2015-04-05 16:23:20 +02:00			`# -- coding: utf-8 --`

add '--items' option this allows to specify which manga-chapters/comic-issues to download when using gallery-dl on a manga/comic URL 2017-02-20 22:02:49 +01:00			`# Copyright 2015-2017 Mike Fährmann`
move DownloadManager and ExtractorFinder 2015-04-05 16:23:20 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

add '-j/--dump-json' option this outputs the extractor-results in JSON format rather then downloading files 2017-04-12 18:43:41 +02:00			`import sys`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`import json`
			`import hashlib`
move code into util.py 2017-03-28 13:12:44 +02:00			`from . import extractor, downloader, config, util, output, exception`
delay 'requests'-import 2015-11-24 19:47:51 +01:00			`from .extractor.message import Message`
move DownloadManager and ExtractorFinder 2015-04-05 16:23:20 +02:00
code adjustments according to pep8 2017-01-30 19:40:15 +01:00
add base class for job types 2015-12-12 00:11:05 +01:00			`class Job():`
			`"""Base class for Job-types"""`
add `--write-unsupported` option (#15) 2017-05-27 16:16:57 +02:00			`ufile = None`
update download-infrastructure 2015-04-08 01:51:48 +02:00
remove DownloadManager class 2015-11-12 02:35:30 +01:00			`def __init__(self, url):`
move some exception handling code 2017-02-25 23:53:31 +01:00			`self.url = url`
restructure info-parameters 2015-11-21 00:30:31 +01:00			`self.extractor = extractor.find(url)`
sanatize output 2015-06-28 12:45:52 +02:00			`if self.extractor is None:`
rename a few files 2016-07-14 14:25:56 +02:00			`raise exception.NoExtractorError(url)`
code cleanup and fixing tests 2017-06-02 09:10:58 +02:00			`self.extractor.log.debug(`
			`"Using %s for %s", self.extractor.__class__.__name__, url)`
add base class for job types 2015-12-12 00:11:05 +01:00
implement '--images' and '--chapters' options - the former '--items' has been renamed to '--chapters' - #6 2017-02-23 21:51:29 +01:00			`items = config.get(("images",))`
implement basic way to tell extractors to skip ahead 2017-03-03 17:26:50 +01:00			`if items:`
			`pred = util.RangePredicate(items)`
			`if pred.lower > 1:`
			`pred.index += self.extractor.skip(pred.lower - 1)`
			`self.pred_url = pred`
			`else:`
			`self.pred_url = True`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00
implement '--images' and '--chapters' options - the former '--items' has been renamed to '--chapters' - #6 2017-02-23 21:51:29 +01:00			`items = config.get(("chapters",))`
			`self.pred_queue = util.RangePredicate(items) if items else True`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00
implement '--images' and '--chapters' options - the former '--items' has been renamed to '--chapters' - #6 2017-02-23 21:51:29 +01:00			`def run(self):`
			`"""Execute or run the job"""`
			`try:`
use logging to report errors 2017-03-11 01:47:57 +01:00			`log = self.extractor.log`
implement '--images' and '--chapters' options - the former '--items' has been renamed to '--chapters' - #6 2017-02-23 21:51:29 +01:00			`for msg in self.extractor:`
fix exception based tests 2017-02-26 02:06:56 +01:00			`self.dispatch(msg)`
move some exception handling code 2017-02-25 23:53:31 +01:00			`except exception.AuthenticationError:`
use logging to report errors 2017-03-11 01:47:57 +01:00			`log.error("Authentication failed. Please provide a valid "`
			`"username/password pair.")`
move some exception handling code 2017-02-25 23:53:31 +01:00			`except exception.AuthorizationError:`
use logging to report errors 2017-03-11 01:47:57 +01:00			`log.error("You do not have permission to access the resource "`
			`"at '%s'", self.url)`
improve 'extractor.request' - add 'fatal' argument - improve internal logic and flow - raise known exception on error - update exception hierarchy 2017-08-05 16:11:46 +02:00			`except exception.NotFoundError as exc:`
			`res = str(exc) or "resource (gallery/image/user)"`
use logging to report errors 2017-03-11 01:47:57 +01:00			`log.error("The %s at '%s' does not exist", res, self.url)`
improve 'extractor.request' - add 'fatal' argument - improve internal logic and flow - raise known exception on error - update exception hierarchy 2017-08-05 16:11:46 +02:00			`except exception.HttpError as exc:`
			`log.error("HTTP request failed:\n%s", exc)`
implement '--images' and '--chapters' options - the former '--items' has been renamed to '--chapters' - #6 2017-02-23 21:51:29 +01:00			`except exception.StopExtraction:`
			`pass`
update error message for unspecified exceptions - ask user to report unexpected errors, which usually indicate extractor failure - handle OSErrors separately (permissions, disk full, etc) - revert 30eef52 2017-08-10 16:29:05 +02:00			`except OSError as exc:`
			`log.error("Unable to download data: %s", exc)`
add -v/--verbose option and reduce error verbosity (#12) 2017-04-18 11:38:48 +02:00			`except Exception as exc:`
update error message for unspecified exceptions - ask user to report unexpected errors, which usually indicate extractor failure - handle OSErrors separately (permissions, disk full, etc) - revert 30eef52 2017-08-10 16:29:05 +02:00			`log.error(("An unexpected error occurred: %s - %s. "`
			`"Please run gallery-dl again with the --verbose flag, "`
			`"copy its output and report this issue on "`
			`"https://github.com/mikf/gallery-dl/issues ."),`
			`exc.__class__.__name__, exc)`
			`log.debug("Traceback", exc_info=True)`
add -v/--verbose option and reduce error verbosity (#12) 2017-04-18 11:38:48 +02:00
fix exception based tests 2017-02-26 02:06:56 +01:00			`def dispatch(self, msg):`
			`"""Call the appropriate message handler"""`
fix/improve various things 2017-03-17 09:39:46 +01:00			`if msg[0] == Message.Url:`
			`if self.pred_url:`
			`self.update_kwdict(msg[2])`
			`self.handle_url(msg[1], msg[2])`
fix exception based tests 2017-02-26 02:06:56 +01:00
			`elif msg[0] == Message.Directory:`
			`self.update_kwdict(msg[1])`
			`self.handle_directory(msg[1])`

fix/improve various things 2017-03-17 09:39:46 +01:00			`elif msg[0] == Message.Queue:`
			`if self.pred_queue:`
			`self.handle_queue(msg[1])`
fix exception based tests 2017-02-26 02:06:56 +01:00
			`elif msg[0] == Message.Version:`
			`if msg[1] != 1:`
			`raise "unsupported message-version ({}, {})".format(`
			`self.extractor.category, msg[1]`
			`)`
			`# TODO: support for multiple message versions`

add '-j/--dump-json' option this outputs the extractor-results in JSON format rather then downloading files 2017-04-12 18:43:41 +02:00			`def handle_url(self, url, keywords):`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`"""Handle Message.Url"""`

			`def handle_directory(self, keywords):`
			`"""Handle Message.Directory"""`

			`def handle_queue(self, url):`
			`"""Handle Message.Queue"""`

			`def update_kwdict(self, kwdict):`
			`"""Add 'category' and 'subcategory' keywords"""`
			`kwdict["category"] = self.extractor.category`
			`kwdict["subcategory"] = self.extractor.subcategory`
add base class for job types 2015-12-12 00:11:05 +01:00
add `--write-unsupported` option (#15) 2017-05-27 16:16:57 +02:00			`def _write_unsupported(self, url):`
			`if self.ufile:`
			`print(url, file=self.ufile, flush=True)`

code adjustments according to pep8 2017-01-30 19:40:15 +01:00
add base class for job types 2015-12-12 00:11:05 +01:00			`class DownloadJob(Job):`
			`"""Download images into appropriate directory/filename locations"""`

			`def __init__(self, url):`
			`Job.__init__(self, url)`
move code into util.py 2017-03-28 13:12:44 +02:00			`self.pathfmt = util.PathFormat(self.extractor)`
update download-infrastructure 2015-04-08 01:51:48 +02:00			`self.downloaders = {}`
get extension from Content-Type header if not provided 2016-09-30 12:32:48 +02:00			`self.out = output.select()`
update download-infrastructure 2015-04-08 01:51:48 +02:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_url(self, url, keywords):`
			`"""Download the resource specified in 'url'"""`
get extension from Content-Type header if not provided 2016-09-30 12:32:48 +02:00			`self.pathfmt.set_keywords(keywords)`
			`if self.pathfmt.exists():`
			`self.out.skip(self.pathfmt.path)`
update download-infrastructure 2015-04-08 01:51:48 +02:00			`return`
remove DownloadManager class 2015-11-12 02:35:30 +01:00			`dlinstance = self.get_downloader(url)`
get extension from Content-Type header if not provided 2016-09-30 12:32:48 +02:00			`dlinstance.download(url, self.pathfmt)`
update download-infrastructure 2015-04-08 01:51:48 +02:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_directory(self, keywords):`
update download-infrastructure 2015-04-08 01:51:48 +02:00			`"""Set and create the target directory for downloads"""`
get extension from Content-Type header if not provided 2016-09-30 12:32:48 +02:00			`self.pathfmt.set_directory(keywords)`
update download-infrastructure 2015-04-08 01:51:48 +02:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_queue(self, url):`
			`try:`
run queue items immediately 2017-05-24 15:15:06 +02:00			`DownloadJob(url).run()`
			`except exception.NoExtractorError:`
add `--write-unsupported` option (#15) 2017-05-27 16:16:57 +02:00			`self._write_unsupported(url)`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00
update download-infrastructure 2015-04-08 01:51:48 +02:00			`def get_downloader(self, url):`
			`"""Return, and possibly construct, a downloader suitable for 'url'"""`
			`pos = url.find(":")`
			`scheme = url[:pos] if pos != -1 else "http"`
			`if scheme == "https":`
			`scheme = "http"`
remove DownloadManager class 2015-11-12 02:35:30 +01:00			`instance = self.downloaders.get(scheme)`
			`if instance is None:`
			`klass = downloader.find(scheme)`
share extractor and downloader sessions There was never any "good" reason for the strict separation between extractors and downloaders. This change allows for reduced resource usage (probably unnoticeable) and less lines of code at the "cost" of tighter coupling. 2017-06-30 19:38:14 +02:00			`instance = klass(self.extractor.session, self.out)`
remove DownloadManager class 2015-11-12 02:35:30 +01:00			`self.downloaders[scheme] = instance`
			`return instance`

add KeywordJob class 2015-11-13 01:02:49 +01:00
add base class for job types 2015-12-12 00:11:05 +01:00			`class KeywordJob(Job):`
			`"""Print available keywords"""`
add KeywordJob class 2015-11-13 01:02:49 +01:00
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`def handle_url(self, url, keywords):`
			`print("\nKeywords for filenames:")`
			`print("-----------------------")`
			`self.print_keywords(keywords)`
			`raise exception.StopExtraction()`

			`def handle_directory(self, keywords):`
			`print("Keywords for directory names:")`
			`print("-----------------------------")`
			`self.print_keywords(keywords)`
add KeywordJob class 2015-11-13 01:02:49 +01:00
extend output of --list-keywords 2017-08-10 17:36:21 +02:00			`def handle_queue(self, url):`
			`print("This extractor transfers work to other extractors and does not "`
			`"provide any keywords on its own. Try "`
			`"'gallery-dl --list-keywords \"", url, "\"' instead.", sep="")`
			`raise exception.StopExtraction()`

add KeywordJob class 2015-11-13 01:02:49 +01:00			`@staticmethod`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`def print_keywords(keywords, prefix=""):`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`"""Print key-value pairs with formatting"""`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`suffix = "]" if prefix else ""`
add KeywordJob class 2015-11-13 01:02:49 +01:00			`for key, value in sorted(keywords.items()):`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`key = prefix + key + suffix`
rework the output format for --list-keywords 2017-05-15 18:30:47 +02:00
			`if isinstance(value, dict):`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`KeywordJob.print_keywords(value, key + "[")`
rework the output format for --list-keywords 2017-05-15 18:30:47 +02:00
			`elif isinstance(value, list):`
			`if value and isinstance(value[0], dict):`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`KeywordJob.print_keywords(value[0], key + "[][")`
rework the output format for --list-keywords 2017-05-15 18:30:47 +02:00			`else:`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`print(key, "[]", sep="")`
rework the output format for --list-keywords 2017-05-15 18:30:47 +02:00			`for val in value:`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`print(" -", val)`
rework the output format for --list-keywords 2017-05-15 18:30:47 +02:00
			`else:`
			`# string or number`
code cleanup + fixes 2017-05-17 14:31:14 +02:00			`print(key, "\n ", value, sep="")`
implement -g,--get-urls option 2015-12-10 02:14:28 +01:00

add base class for job types 2015-12-12 00:11:05 +01:00			`class UrlJob(Job):`
			`"""Print download urls"""`
rework the '-g' cmdline option the amount of how often the -g option is given now determines up until what level URLs are resolved. example: $ gallery-dl -g http://kissmanga.com/Manga/Dropout http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847 - when applied to a manga-extractor, specifying the -g option once will now print a list of all chapter URls $ gallery-dl -gg http://kissmanga.com/Manga/Dropout http://2.bp.blogspot.com/.../000.png http://2.bp.blogspot.com/.../001.png ... - specifying it twice (or even more often) will go a level deeper and print the image URLs found in those chapters 2017-02-17 22:18:16 +01:00			`maxdepth = -1`

			`def __init__(self, url, depth=1):`
			`Job.__init__(self, url)`
			`self.depth = depth`
			`if depth == self.maxdepth:`
add `--write-unsupported` option (#15) 2017-05-27 16:16:57 +02:00			`self.handle_queue = print`
implement -g,--get-urls option 2015-12-10 02:14:28 +01:00
[reddit] some small fixes - filter or complete some URLs - remove the 'nofollow:' scheme before printing URLs - (#15) 2017-05-23 11:48:00 +02:00			`@staticmethod`
			`def handle_url(url, _):`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`print(url)`
print urls recursively 2016-08-11 13:20:21 +02:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_queue(self, url):`
			`try:`
rework the '-g' cmdline option the amount of how often the -g option is given now determines up until what level URLs are resolved. example: $ gallery-dl -g http://kissmanga.com/Manga/Dropout http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847 - when applied to a manga-extractor, specifying the -g option once will now print a list of all chapter URls $ gallery-dl -gg http://kissmanga.com/Manga/Dropout http://2.bp.blogspot.com/.../000.png http://2.bp.blogspot.com/.../001.png ... - specifying it twice (or even more often) will go a level deeper and print the image URLs found in those chapters 2017-02-17 22:18:16 +01:00			`UrlJob(url, self.depth + 1).run()`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`except exception.NoExtractorError:`
add `--write-unsupported` option (#15) 2017-05-27 16:16:57 +02:00			`self._write_unsupported(url)`
[reddit] some small fixes - filter or complete some URLs - remove the 'nofollow:' scheme before printing URLs - (#15) 2017-05-23 11:48:00 +02:00
add HashJob for automated testing 2015-12-12 01:16:02 +01:00
fix exception based tests 2017-02-26 02:06:56 +01:00			`class TestJob(DownloadJob):`
			`"""Generate test-results for extractor runs"""`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00			`class HashIO():`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`"""Minimal file-like interface"""`
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00
			`def __init__(self, hashobj):`
			`self.hashobj = hashobj`
get extension from Content-Type header if not provided 2016-09-30 12:32:48 +02:00			`self.path = ""`
			`self.has_extension = True`

			`def __enter__(self):`
			`return self`

			`def __exit__(self, *args):`
			`pass`

			`def open(self):`
			`return self`
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00
			`def write(self, content):`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`"""Update SHA1 hash"""`
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00			`self.hashobj.update(content)`

			`def __init__(self, url, content=False):`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`DownloadJob.__init__(self, url)`
code adjustments according to pep8 2017-01-30 19:40:15 +01:00			`self.content = content`
			`self.hash_url = hashlib.sha1()`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`self.hash_keyword = hashlib.sha1()`
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00			`self.hash_content = hashlib.sha1()`
			`if content:`
			`self.fileobj = self.HashIO(self.hash_content)`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00
fix exception based tests 2017-02-26 02:06:56 +01:00			`def run(self):`
restore exception-testing to its old form 2017-02-27 23:05:08 +01:00			`for msg in self.extractor:`
			`self.dispatch(msg)`
fix exception based tests 2017-02-26 02:06:56 +01:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_url(self, url, keywords):`
			`self.update_url(url)`
			`self.update_keyword(keywords)`
			`self.update_content(url)`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_directory(self, keywords):`
			`self.update_keyword(keywords)`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`def handle_queue(self, url):`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`self.update_url(url)`

			`def update_url(self, url):`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`"""Update the URL hash"""`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`self.hash_url.update(url.encode())`

			`def update_keyword(self, kwdict):`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`"""Update the keyword hash"""`
add HashJob for automated testing 2015-12-12 01:16:02 +01:00			`self.hash_keyword.update(`
			`json.dumps(kwdict, sort_keys=True).encode()`
			`)`
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00
			`def update_content(self, url):`
add (sub)category keyword automatically 2016-09-24 10:45:11 +02:00			`"""Update the content hash"""`
update HashJob to generate hashes for downloaded content 2015-12-21 22:49:04 +01:00			`if self.content:`
			`self.get_downloader(url).download(url, self.fileobj)`
add '-j/--dump-json' option this outputs the extractor-results in JSON format rather then downloading files 2017-04-12 18:43:41 +02:00

			`class DataJob(Job):`
			`"""Collect extractor results and dump them"""`

			`def __init__(self, url, file=sys.stdout):`
			`Job.__init__(self, url)`
			`self.file = file`
			`self.data = []`
			`self.ensure_ascii = config.get(("output", "ascii"), True)`

			`def run(self):`
			`# collect data`
			`try:`
			`for msg in self.extractor:`
share extractor and downloader sessions There was never any "good" reason for the strict separation between extractors and downloaders. This change allows for reduced resource usage (probably unnoticeable) and less lines of code at the "cost" of tighter coupling. 2017-06-30 19:38:14 +02:00			`copy = [`
			`part.copy() if hasattr(part, "copy") else part`
			`for part in msg`
			`]`
add '-j/--dump-json' option this outputs the extractor-results in JSON format rather then downloading files 2017-04-12 18:43:41 +02:00			`self.data.append(copy)`
			`except Exception as exc:`
			`self.data.append((exc.__class__.__name__, str(exc)))`

			`# dump to 'file'`
			`json.dump(`
			`self.data, self.file,`
			`sort_keys=True, indent=2, ensure_ascii=self.ensure_ascii`
			`)`
			`self.file.write("\n")`