From 39d9c362e4a0bcb8f916da2c49276ef147194350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 7 Nov 2022 16:33:26 +0100 Subject: [PATCH] include 'http-metadata' in '-K' output --- gallery_dl/downloader/http.py | 20 +------------------- gallery_dl/job.py | 24 +++++++++++++++--------- gallery_dl/util.py | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 2e7e76e6..8e112dc6 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -14,8 +14,6 @@ from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase from .. import text, util -from email.utils import parsedate_tz -from datetime import datetime from ssl import SSLError try: from OpenSSL.SSL import Error as OpenSSLError @@ -197,7 +195,7 @@ class HttpDownloader(DownloaderBase): # set metadata from HTTP headers if self.metadata: - kwdict[self.metadata] = self._extract_metadata(response) + kwdict[self.metadata] = util.extract_headers(response) pathfmt.build_path() if pathfmt.exists(): pathfmt.temppath = "" @@ -305,22 +303,6 @@ class HttpDownloader(DownloaderBase): t1 = t2 - def _extract_metadata(self, response): - headers = response.headers - data = dict(headers) - - hcd = headers.get("content-disposition") - if hcd: - name = text.extr(hcd, 'filename="', '"') - if name: - text.nameext_from_url(name, data) - - hlm = headers.get("last-modified") - if hlm: - data["date"] = datetime(*parsedate_tz(hlm)[:6]) - - return data - def _find_extension(self, response): """Get filename extension from MIME type""" mtype = response.headers.get("Content-Type", "image/jpeg") diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 5fe6dfbb..c03a7c90 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -32,11 +32,8 @@ class Job(): self.pathfmt = None self.kwdict = {} self.status = 0 - self.url_key = extr.config("url-metadata") - path_key = extr.config("path-metadata") path_proxy = output.PathfmtProxy(self) - self._logger_extra = { "job" : self, "extractor": extr, @@ -56,12 +53,16 @@ class Job(): extr.category = pextr.category extr.subcategory = pextr.subcategory + self.metadata_url = extr.config("url-metadata") + self.metadata_http = extr.config("http-metadata") + metadata_path = extr.config("path-metadata") + # user-supplied metadata kwdict = extr.config("keywords") if kwdict: self.kwdict.update(kwdict) - if path_key: - self.kwdict[path_key] = path_proxy + if metadata_path: + self.kwdict[metadata_path] = path_proxy # predicates self.pred_url = self._prepare_predicates("image", True) @@ -120,8 +121,8 @@ class Job(): """Call the appropriate message handler""" if msg[0] == Message.Url: _, url, kwdict = msg - if self.url_key: - kwdict[self.url_key] = url + if self.metadata_url: + kwdict[self.metadata_url] = url if self.pred_url(url, kwdict): self.update_kwdict(kwdict) self.handle_url(url, kwdict) @@ -132,8 +133,8 @@ class Job(): elif msg[0] == Message.Queue: _, url, kwdict = msg - if self.url_key: - kwdict[self.url_key] = url + if self.metadata_url: + kwdict[self.metadata_url] = url if self.pred_queue(url, kwdict): self.handle_queue(url, kwdict) @@ -557,6 +558,11 @@ class KeywordJob(Job): def handle_url(self, url, kwdict): stdout_write("\nKeywords for filenames and --filter:\n" "------------------------------------\n") + + if self.metadata_http: + kwdict[self.metadata_http] = util.extract_headers( + self.extractor.request(url, method="HEAD")) + self.print_kwdict(kwdict) raise exception.StopExtraction() diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 18faec46..8ce1fb40 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -274,6 +274,23 @@ Response Headers fp.write(response.content) +def extract_headers(response): + headers = response.headers + data = dict(headers) + + hcd = headers.get("content-disposition") + if hcd: + name = text.extr(hcd, 'filename="', '"') + if name: + text.nameext_from_url(name, data) + + hlm = headers.get("last-modified") + if hlm: + data["date"] = datetime.datetime(*parsedate_tz(hlm)[:6]) + + return data + + @functools.lru_cache(maxsize=None) def git_head(): try: