# -*- coding: utf-8 -*-

"""Extractor for images in a generic web page."""

from .common import Extractor, Message
from .. import config, text
import re
import os.path


class GenericExtractor(Extractor):
    """Extractor for images in a generic web page.

    The class-level ``pattern`` matches an optional ``g:`` / ``generic:``
    prefix followed by an (almost) arbitrary URL.  ``__init__`` turns the
    match into a page URL, ``items`` downloads that page and yields one
    message per image found on it.
    """
    category = "generic"
    directory_fmt = ("{category}", "{pageurl}")
    archive_fmt = "{imageurl}"

    # By default, the generic extractor is disabled
    # and the "g(eneric):" prefix in url is required.
    # If the extractor is enabled, make the prefix optional
    #
    # The group must be named "generic": __init__ reads it through
    # match.group('generic').
    pattern = r"(?ix)(?P<generic>g(?:eneric)?:)"
    if config.get(("extractor", "generic"), "enabled"):
        pattern += r"?"

    # The generic extractor pattern should match (almost) any valid url
    # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
    #
    # The pattern is compiled in verbose mode ("(?ix)" above), where an
    # unescaped "#" outside a character class starts a comment that runs to
    # the end of the line.  Every sub-pattern therefore has to stay on its
    # own line, otherwise the first comment would swallow the rest.
    # "scheme" and "domain" are read by __init__; "path", "query" and
    # "fragment" are named after their inline comments and are not
    # referenced in the visible part of this file.
    pattern += r"""
        (?P<scheme>https?://)?          # optional http(s) scheme
        (?P<domain>[-\w\.]+)            # required domain
        (?P<path>/[^?#]*)?              # optional path
        (?:\?(?P<query>[^#]*))?         # optional query
        (?:\#(?P<fragment>.*))?         # optional fragment
        """

    # Sample URLs with expected results
    # (presumably consumed by the project's test suite -- TODO confirm).
    test = (
        ("generic:https://www.nongnu.org/lzip/", {
            "count": 1,
            "content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
            "keyword": {
                "description": "Lossless data compressor",
                "imageurl": "https://www.nongnu.org/lzip/lzip.png",
                "keywords": "lzip, clzip, plzip, lzlib, LZMA, bzip2, "
                            "gzip, data compression, GNU, free software",
                "pageurl": "https://www.nongnu.org/lzip/",
            },
        }),
        # internationalized domain name
        ("generic:https://räksmörgås.josefsson.org/", {
            "count": 2,
            "pattern": "^https://räksmörgås.josefsson.org/",
        }),
        ("generic:https://en.wikipedia.org/Main_Page"),
        ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
        ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
    )

    def __init__(self, match):
        """Derive the page URL, its scheme and its root from a pattern match.

        Args:
            match: Match object produced by ``pattern``; the groups
                ``generic``, ``scheme`` and ``domain`` are read from it.

        Sets ``self.url`` (page URL without the "g(eneric):" prefix and with
        a scheme), ``self.scheme`` and ``self.root`` (scheme + domain).
        """
        Extractor.__init__(self, match)

        # Strip the "g(eneric):" prefix
        # and inform about "forced" or "fallback" mode
        if match.group('generic'):
            self.log.info("Forcing use of generic information extractor.")
            # Everything after the first ":" is the actual URL.
            self.url = match.group(0).partition(":")[2]
        else:
            self.log.info("Falling back on generic information extractor.")
            self.url = match.group(0)

        # Make sure we have a scheme, or use https
        if match.group('scheme'):
            self.scheme = match.group('scheme')
        else:
            self.scheme = 'https://'
            self.url = self.scheme + self.url

        # Used to resolve relative image urls
        self.root = self.scheme + match.group('domain')

    def items(self):
        """Fetch the page, extract metadata and images, yield messages.

        Adapted from common.GalleryExtractor.items()

        Yields:
            ``(Message.Version, 1)``, then ``(Message.Directory, data)``,
            then one ``(Message.Url, url, data)`` per image.  The same
            ``data`` dict is updated in place and yielded for every image.
        """
        page = self.request(self.url).text
        data = self.metadata(page)
        # NOTE(review): images() is defined outside the visible chunk.
        imgs = self.images(page)

        try:
            data["count"] = len(imgs)
        except TypeError:
            # The result has no usable length; leave "count" unset.
            pass
        images = enumerate(imgs, 1)

        yield Message.Version, 1
        yield Message.Directory, data

        # The running 1-based index is written straight into data["num"].
        for data["num"], (url, imgdata) in images:
            if imgdata:
                data.update(imgdata)
                # Only derive name/extension from the URL when the
                # per-image data does not already provide an extension.
                if "extension" not in imgdata:
                    text.nameext_from_url(url, data)
            else:
                text.nameext_from_url(url, data)
            yield Message.Url, url, data

    def metadata(self, page):
        """Extract generic webpage metadata, return them in a dict."""
        data = {}
        data['pageurl'] = self.url
        # NOTE(review): both delimiters are empty strings in the reviewed
        # source; they look like markup that was lost -- confirm the
        # intended begin/end markers.
        data['title'] = text.extr(page, '', "")
        # NOTE(review): the reviewed source is cut off in the middle of the
        # next statement, reproduced verbatim below.  The remainder of this
        # method, including whatever it returns, lies outside the visible
        # chunk and must be restored from the full file.
        #   data['description'] = text.extr( page, ',