gallery-dl/gallery_dl/extractor/directlink.py

# -*- coding: utf-8 -*-

# Copyright 2017-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Direct link handling"""

from .common import Extractor, Message
from .. import text


class DirectlinkExtractor(Extractor):
    """Extractor for URLs that point straight at a media or archive file

    The URL pattern captures 'domain', 'path', 'query' and 'fragment';
    items() turns those captures into the metadata used for the output path.
    """
    category = "directlink"
    filename_fmt = "{domain}/{path}/{filename}.{extension}"
    archive_fmt = filename_fmt
    pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\."
               r"(?:jpe?g|jpe|png|gif|bmp|svg|web[mp]|avif|heic|psd"
               r"|mp4|m4v|mov|mkv|og[gmv]|wav|mp3|opus|zip|rar|7z|pdf|swf))"
               r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
    example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png"

    def __init__(self, match):
        """Store the named groups of 'match' as this extractor's metadata"""
        Extractor.__init__(self, match)
        # Optional groups (query, fragment) are None when absent from the URL.
        self.data = match.groupdict()

    def items(self):
        """Yield one Directory message and one Url message for self.url

        The metadata dict stored in self.data is updated in place and is
        the same object attached to both messages.
        """
        metadata = self.data

        # Unquote every captured URL component that is non-empty;
        # None and empty-string values are left untouched.
        for field in metadata:
            raw = metadata[field]
            if raw:
                metadata[field] = text.unquote(raw)

        # Split "dir/sub/name.ext" into its directory part, stem and suffix.
        directory, _, basename = metadata["path"].rpartition("/")
        stem, _, suffix = basename.rpartition(".")
        metadata["path"] = directory
        metadata["filename"] = stem
        metadata["extension"] = suffix.lower()

        # Encode the Referer ourselves, dropping characters that latin-1
        # cannot represent, instead of handing over the raw str URL.
        referer = self.url.encode("latin-1", "ignore")
        metadata["_http_headers"] = {"Referer": referer}

        yield Message.Directory, metadata
        yield Message.Url, self.url, metadata
support direct image links 2017-05-24 12:51:18 +02:00			`# -*- coding: utf-8 -*-`

remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`# Copyright 2017-2023 Mike Fährmann`
support direct image links 2017-05-24 12:51:18 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Direct link handling"""`

			`from .common import Extractor, Message`
			`from .. import text`


			`class DirectlinkExtractor(Extractor):`
[directlink] update URL pattern & PEP 8 - combine some file extensions - don't match '.je' - line length < 80 2017-07-27 20:46:15 +02:00			`"""Extractor for direct links to images and other media files"""`
support direct image links 2017-05-24 12:51:18 +02:00			`category = "directlink"`
[directlink] separate filenames from paths With this, all default filename formats specify an '{extension}' and PathFormat.set_extension() reliably works for all files. 2019-11-28 23:39:35 +01:00			`filename_fmt = "{domain}/{path}/{filename}.{extension}"`
			`archive_fmt = filename_fmt`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\."`
[directlink] extend recognized file extensions (#5924) bmp, svg, avif, heic, psd, pdf, m4v, mov, wav, mp3, zip, rar, 7z, swf 2024-08-02 12:21:41 +02:00			`r"(?:jpe?g|jpe|png|gif|bmp|svg|web[mp]|avif|heic|psd"`
			`r"|mp4|m4v|mov|mkv|og[gmv]|wav|mp3|opus|zip|rar|7z|pdf|swf))"`
allow '/' and '?' in URL queries 2022-10-02 19:02:05 +02:00			`r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png"`
support direct image links 2017-05-24 12:51:18 +02:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`Extractor.__init__(self, match)`
[directlink] improve URL pattern 2017-08-02 21:06:49 +02:00			`self.data = match.groupdict()`
support direct image links 2017-05-24 12:51:18 +02:00
			`def items(self):`
[directlink] separate filenames from paths With this, all default filename formats specify an '{extension}' and PathFormat.set_extension() reliably works for all files. 2019-11-28 23:39:35 +01:00			`data = self.data`
			`for key, value in data.items():`
[directlink] unquote metadata fields 2018-02-26 02:12:47 +01:00			`if value:`
[directlink] separate filenames from paths With this, all default filename formats specify an '{extension}' and PathFormat.set_extension() reliably works for all files. 2019-11-28 23:39:35 +01:00			`data[key] = text.unquote(value)`
[directlink] send Referer headers (closes #536) 2019-12-25 17:17:07 +01:00
[directlink] separate filenames from paths With this, all default filename formats specify an '{extension}' and PathFormat.set_extension() reliably works for all files. 2019-11-28 23:39:35 +01:00			`data["path"], _, name = data["path"].rpartition("/")`
			`data["filename"], _, ext = name.rpartition(".")`
			`data["extension"] = ext.lower()`
[directlink] manually encode Referer URLs (fixes #1647) Trying to send a non-latin-1-encodable header raises an exception, so we encode the Referer value ourselves with 'errors=ignore'. 2021-06-21 20:28:19 +02:00			`data["_http_headers"] = {`
			`"Referer": self.url.encode("latin-1", "ignore")}`
[directlink] unquote metadata fields 2018-02-26 02:12:47 +01:00
[directlink] separate filenames from paths With this, all default filename formats specify an '{extension}' and PathFormat.set_extension() reliably works for all files. 2019-11-28 23:39:35 +01:00			`yield Message.Directory, data`
			`yield Message.Url, self.url, data`