gallery-dl/gallery_dl/extractor/directlink.py

# -*- coding: utf-8 -*-

# Copyright 2017-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Direct link handling"""

from .common import Extractor, Message
from .. import text


class DirectlinkExtractor(Extractor):
    """Extractor for direct links to images and other media files

    Matches http(s) URLs whose path ends in one of the media file
    extensions listed in 'pattern'.  The named groups of the pattern
    ('domain', 'path', 'query', 'fragment') are used as metadata fields,
    and the matched URL itself is emitted as the only file.
    """
    category = "directlink"
    filename_fmt = "{domain}/{path}"
    archive_fmt = "{domain}/{path}"
    # The query component may itself contain "/" and "?" characters
    # (RFC 3986, section 3.4); only "#" terminates it.  Excluding them
    # would reject otherwise valid direct links such as
    # "https://host/img.jpg?src=http://other/path".
    pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\."
               r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
               r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
    test = (
        ("https://en.wikipedia.org/static/images/project-logos/enwiki.png", {
            "url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
            "keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed",
        }),
        # more complex example
        ("https://example.org/path/file.webm?que=1&ry=2#fragment", {
            "url": "fd4aec8a32842343394e6078a06c3e6b647bf671",
            "keyword": "ff75764b1ae66615b723a6357b8193fa2de84678",
        }),
        # percent-encoded characters
        ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {
            "url": "2627e8140727fdf743f86fe18f69f99a052c9718",
            "keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e",
        }),
        # upper case file extension (#296)
        ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
         ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
         "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
    )

    def __init__(self, match):
        """Store the named groups of 'match' as the initial metadata"""
        Extractor.__init__(self, match)
        # Groups that did not participate in the match are None
        self.data = match.groupdict()

    def items(self):
        """Yield the messages describing the matched URL

        Emits Message.Version, then Message.Directory with the metadata
        dict, and finally Message.Url with the matched URL and that dict.
        """
        data = self.data
        text.nameext_from_url(self.url, data)

        # Percent-decode every non-empty field; None and "" are left as is.
        # The decoded values are collected first so that the dict is not
        # assigned to while it is being iterated.
        # NOTE(review): this also covers whatever nameext_from_url() stored
        # above, so those values pass through unquote() here as well -
        # confirm that decoding them at this point is intended.
        data.update({
            key: text.unquote(value)
            for key, value in data.items()
            if value
        })

        yield Message.Version, 1
        yield Message.Directory, data
        yield Message.Url, self.url, data
support direct image links 2017-05-24 12:51:18 +02:00			`# -- coding: utf-8 --`

simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`# Copyright 2017-2019 Mike Fährmann`
support direct image links 2017-05-24 12:51:18 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Direct link handling"""`

			`from .common import Extractor, Message`
			`from .. import text`


			`class DirectlinkExtractor(Extractor):`
[directlink] update URL pattern & PEP 8 - combine some file extensions - don't match '.je' - line length < 80 2017-07-27 20:46:15 +02:00			`"""Extractor for direct links to images and other media files"""`
support direct image links 2017-05-24 12:51:18 +02:00			`category = "directlink"`
[directlink] update filename format and metadata 2017-05-30 17:33:09 +02:00			`filename_fmt = "{domain}/{path}"`
adjust archive-ids 2018-02-12 23:09:34 +01:00			`archive_fmt = "{domain}/{path}"`
[directlink] make pattern case insensitive (fixes #296) 2019-06-03 10:56:14 +02:00			`pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\."`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`r"(?:jpe?g\|jpe\|png\|gif\|web[mp]\|mp4\|mkv\|og[gmv]\|opus))"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`r"(?:\?(?P<query>[^/?#]))?(?:#(?P<fragment>.))?$")`
			`test = (`
[smugmug] added image and album extractor just some initial code that still requires a lot of work ... TODO: - folders - old-style albums (which are nearly all of them ...) - images from users - OAuth It could also happen that the API credentials used will become invalid whenever my 14 day trial period ends (7 days remaining), but that would just require users to supply their own. 2018-04-29 21:27:25 +02:00			`(("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), {`
			`"url": "18c5d00077332e98e53be9fed2ee4be66154b88d",`
change results of text.nameext_from_url() Instead of getting a complete 'filename' from an URL and splitting that into 'name' and 'extension', the new approach gets rid of the complete version and renames 'name' to 'filename'. (Using anything other than {extension} for a filename extension doesn't really work anyway) Example: "https://example.org/path/filename.ext" before: - filename : filename.ext - name : filename - extension: ext now: - filename : filename - extension: ext 2019-02-14 16:07:17 +01:00			`"keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed",`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`}),`
[directlink] unquote metadata fields 2018-02-26 02:12:47 +01:00			`# more complex example`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`("https://example.org/path/file.webm?que=1&ry=2#fragment", {`
			`"url": "fd4aec8a32842343394e6078a06c3e6b647bf671",`
change results of text.nameext_from_url() Instead of getting a complete 'filename' from an URL and splitting that into 'name' and 'extension', the new approach gets rid of the complete version and renames 'name' to 'filename'. (Using anything other than {extension} for a filename extension doesn't really work anyway) Example: "https://example.org/path/filename.ext" before: - filename : filename.ext - name : filename - extension: ext now: - filename : filename - extension: ext 2019-02-14 16:07:17 +01:00			`"keyword": "ff75764b1ae66615b723a6357b8193fa2de84678",`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`}),`
[directlink] unquote metadata fields 2018-02-26 02:12:47 +01:00			`# percent-encoded characters`
			`("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {`
[directlink] update test results 2018-02-26 03:01:23 +01:00			`"url": "2627e8140727fdf743f86fe18f69f99a052c9718",`
change results of text.nameext_from_url() Instead of getting a complete 'filename' from an URL and splitting that into 'name' and 'extension', the new approach gets rid of the complete version and renames 'name' to 'filename'. (Using anything other than {extension} for a filename extension doesn't really work anyway) Example: "https://example.org/path/filename.ext" before: - filename : filename.ext - name : filename - extension: ext now: - filename : filename - extension: ext 2019-02-14 16:07:17 +01:00			`"keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e",`
[directlink] unquote metadata fields 2018-02-26 02:12:47 +01:00			`}),`
[directlink] make pattern case insensitive (fixes #296) 2019-06-03 10:56:14 +02:00			`# upper case file extension (#296)`
			`("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"`
			`".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"`
			`"mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`)`
support direct image links 2017-05-24 12:51:18 +02:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`Extractor.__init__(self, match)`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`self.data = match.groupdict()`
support direct image links 2017-05-24 12:51:18 +02:00
			`def items(self):`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`text.nameext_from_url(self.url, self.data)`
[directlink] unquote metadata fields 2018-02-26 02:12:47 +01:00			`for key, value in self.data.items():`
			`if value:`
			`self.data[key] = text.unquote(value)`

support direct image links 2017-05-24 12:51:18 +02:00			`yield Message.Version, 1`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`yield Message.Directory, self.data`
			`yield Message.Url, self.url, self.data`