# -*- coding: utf-8 -*-

# Copyright 2014, 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from galleries at http://exhentai.org/"""

from .common import Extractor, Message
from .. import config, text, iso639_1
import time
import random


class ExhentaiExtractor(Extractor):
    """Extractor for image galleries at exhentai.org.

    ``pattern`` accepts gallery URLs on both ``exhentai.org`` and
    ``g.e-hentai.org`` and captures three groups: the host variant
    (``"ex"`` or ``"g.e-"``), the numeric gallery id, and the
    10-hex-digit gallery token.
    """

    category = "exhentai"
    directory_fmt = ["{category}", "{gallery-id}"]
    filename_fmt = "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}"
    pattern = [r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
    api_url = "http://exhentai.org/api.php"

    def __init__(self, match):
        """Set up the session and read settings for one gallery.

        Args:
            match: regular-expression match object whose groups are
                (version, gallery-id, token) -- presumably produced by
                matching a URL against ``pattern``; confirm with caller.
        """
        Extractor.__init__(self)

        # The scheme is optional in ``pattern``, so the matched text may be
        # a bare "exhentai.org/g/..." string.  Add a scheme in that case so
        # the URL can be requested as-is; URLs that already carry one are
        # left untouched.
        url = match.group(0)
        if not url.startswith(("http://", "https://")):
            url = "http://" + url
        self.url = url

        self.version, self.gid, self.token = match.groups()

        # Browser-like default headers for every request of this session.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0",
            "Accept": ("text/html,application/xhtml+xml,"
                       "application/xml;q=0.9,*/*;q=0.8"),
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "http://exhentai.org/",
        })

        # Copy user-configured cookies into the session, scoped to the
        # exhentai.org domain.
        cookies = config.get(("extractor", "exhentai", "cookies"), {})
        for key, value in cookies.items():
            self.session.cookies.set(
                key, value, domain=".exhentai.org", path="/")

        # Lower/upper bound of a configurable delay (defaults 3 and 6).
        # NOTE(review): unit and consumer are not visible in this chunk --
        # presumably seconds, used by self.wait(); confirm.
        self.wait_min = config.interpolate(
            ("extractor", "exhentai", "wait-min"), 3)
        self.wait_max = config.interpolate(
            ("extractor", "exhentai", "wait-max"), 6)
        # Never let the upper bound fall below the lower bound.
        if self.wait_max < self.wait_min:
            self.wait_max = self.wait_min

    def items(self):
        """Yield the extractor messages for this gallery.

        Yields, in order: ``Message.Version``, ``Message.Headers``,
        ``Message.Cookies``, ``Message.Directory`` with the gallery
        metadata, and then one ``Message.Url`` tuple
        ``(Message.Url, url, metadata)`` per image.
        """
        yield Message.Version, 1

        page = self.request(self.url).text
        data, url = self.get_job_metadata(page)

        # Image downloads use the session headers with an image-oriented
        # Accept value; the session's own headers are not modified.
        headers = self.session.headers.copy()
        headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5"
        yield Message.Headers, headers
        yield Message.Cookies, self.session.cookies
        yield Message.Directory, data

        # Select which image URL to emit: "origurl" when the
        # "download-original" option is enabled (default True), else "url".
        urlkey = "url"
        if config.interpolate(
                ("extractor", "exhentai", "download-original"), True):
            urlkey = "origurl"

        for num, image in enumerate(self.get_images(url), 1):
            # Gallery-level metadata is merged into (and overrides same-named
            # keys of) the per-image metadata.
            image.update(data)
            image["num"] = num
            # Presumably fills the name/extension fields used by
            # ``filename_fmt`` from the image URL -- helper not visible here.
            text.nameext_from_url(image["url"], image)
            # Pause before emitting a "/fullimg.php" URL; self.wait() is
            # defined outside this chunk.
            if "/fullimg.php" in image[urlkey]:
                self.wait((1, 2))
            yield Message.Url, image[urlkey], image