gallery-dl/gallery_dl/extractor/exhentai.py

# -*- coding: utf-8 -*-

# Copyright 2014-2016 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from galleries at https://exhentai.org/"""

from .common import Extractor, Message
from .. import config, text, iso639_1, exception
from ..cache import cache
import time
import random

class ExhentaiGalleryExtractor(Extractor):
    """Extractor for image-galleries from exhentai.org

    The gallery page provides the job metadata and a link to an image
    page; that image page provides the keys for the JSON API at 'api_url',
    through which all following images are requested.

    Config values read below ("extractor", "exhentai"):
    wait-min, wait-max, download-original, cookies, username, password
    """
    category = "exhentai"
    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery-id}"]
    filename_fmt = "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}"
    pattern = [r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
    test = [("https://exhentai.org/g/960460/4f0e369d82/", {
        "keyword": "aaac45cad1897a9815384bc3a743ce7502c692f6",
        "content": "493d759de534355c9f55f8e365565b62411de146",
    })]
    api_url = "https://exhentai.org/api.php"

    def __init__(self, match):
        """Store the parts of the matched URL, log in and set up the session

        'match' is a match object of one of the expressions in 'pattern';
        its groups are the site prefix, the gallery id and the gallery token.
        """
        Extractor.__init__(self)
        self.url = match.group(0)
        self.version, self.gid, self.token = match.groups()
        # a real login request changes the "Referer" header of the session;
        # the update below sets it back, so login() has to come first
        self.login()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0",
            "Accept": ("text/html,application/xhtml+xml,"
                       "application/xml;q=0.9,*/*;q=0.8"),
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://exhentai.org/",
        })
        self.wait_min = config.interpolate(
            ("extractor", "exhentai", "wait-min"), 3)
        self.wait_max = config.interpolate(
            ("extractor", "exhentai", "wait-max"), 6)
        # make sure wait-max is never smaller than wait-min
        if self.wait_max < self.wait_min:
            self.wait_max = self.wait_min

    def items(self):
        """Yield the messages for this gallery and each of its images

        Raises exception.NotFoundError if the gallery page starts with one
        of the site's "Key missing" / "Gallery not found" replies.
        """
        yield Message.Version, 1
        page = self.request(self.url).text
        if page.startswith(("Key missing", "Gallery not found")):
            raise exception.NotFoundError("gallery")
        data, url = self.get_job_metadata(page)

        headers = self.session.headers.copy()
        headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5"
        yield Message.Headers, headers
        yield Message.Cookies, self.session.cookies
        yield Message.Directory, data

        download_original = config.interpolate(
            ("extractor", "exhentai", "download-original"), True)
        urlkey = "origurl" if download_original else "url"
        for num, imgdata in enumerate(self.get_images(url), 1):
            # get_images() updates and re-yields one and the same dict;
            # work on a copy so that metadata which has already been
            # handed out does not change when the next image is fetched
            image = dict(imgdata)
            # gallery metadata wins over image metadata with the same key
            image.update(data)
            image["num"] = num
            # name and extension always come from the regular image URL
            text.nameext_from_url(image["url"], image)
            url = image[urlkey]
            del image["url"]
            del image["origurl"]
            if "/fullimg.php" in url:
                # short extra pause before an original-image URL is used
                self.wait((1, 2))
            yield Message.Url, url, image

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job

        Returns a tuple of the metadata dict and the URL of the image
        page to start with, both extracted from the gallery page 'page'.
        """
        data = {
            "category"     : self.category,
            "gallery-id"   : self.gid,
            "gallery-token": self.token,
        }
        data, _ = text.extract_all(page, (
            ("title"   , '<h1 id="gn">', '</h1>'),
            ("title_jp", '<h1 id="gj">', '</h1>'),
            ("date"    , '>Posted:</td><td class="gdt2">', '</td>'),
            ("language", '>Language:</td><td class="gdt2">', '</td>'),
            ("size"    , '>File Size:</td><td class="gdt2">', ' '),
            ("count"   , '>Length:</td><td class="gdt2">', ' '),
            ("url"     , 'hentai.org/s/', '"'),
        ), values=data)
        # only keep the text in front of the first space
        data["language"] = data["language"].partition(" ")[0]
        data["lang"] = iso639_1.language_to_code(data["language"])
        data["title"] = text.unescape(data["title"])
        data["title_jp"] = text.unescape(data["title_jp"])
        # the extracted link is only needed to build the return value
        url = "https://exhentai.org/s/" + data.pop("url")
        return data, url

    def get_images(self, url):
        """Collect url and metadata for all images in this gallery

        'url' is the image page to start with; all further images are
        requested from 'api_url'. Iteration ends with the image whose
        "next" key is its own key.

        Note that the very same dict object is updated and yielded for
        every image; callers which want to keep a result have to copy it.
        """
        self.wait()
        page = self.request(url).text
        data, _ = text.extract_all(page, (
            (None         , '<div id="i3"><a onclick="return load_image(', ''),
            ("imgkey-next", "'", "'"),
            ("url"        , '<img id="img" src="', '"'),
            ("title"      , '<div id="i4"><div>', ' :: '),
            ("origurl"    , 'https://exhentai.org/fullimg.php', '"'),
            ("startkey"   , 'var startkey="', '";'),
            ("showkey"    , 'var showkey="', '";'),
        ))
        data["imgkey"] = data["startkey"]

        # payload for the API; "page" and "imgkey" advance with every image
        request = {
            "method" : "showpage",
            "page"   : 2,
            "gid"    : int(self.gid),
            "imgkey" : data["imgkey-next"],
            # 'showkey' is only needed for the API and not part of the result
            "showkey": data.pop("showkey"),
        }

        if data["origurl"]:
            # the extracted value is only the part after 'fullimg.php'
            data["origurl"] = (
                "https://exhentai.org/fullimg.php" +
                text.unescape(data["origurl"])
            )
        else:
            # no separate original: fall back to the regular image
            data["origurl"] = data["url"]
        yield data

        while data["imgkey"] != data["imgkey-next"]:
            self.wait()
            reply = self.session.post(self.api_url, json=request).json()
            data["imgkey"] = data["imgkey-next"]
            data["imgkey-next"], pos = text.extract(reply["i3"], "'", "'")
            data["url"], _ = text.extract(
                reply["i3"], '<img id="img" src="', '"', pos)
            data["title"], _ = text.extract(reply["i"], '<div>', ' :: ')
            data["origurl"], _ = text.extract(reply["i7"], '<a href="', '"')
            if data["origurl"]:
                data["origurl"] = text.unescape(data["origurl"])
            else:
                data["origurl"] = data["url"]
            yield data
            request["imgkey"] = data["imgkey-next"]
            request["page"] += 1

    def wait(self, waittime=None):
        """Wait for a randomly chosen amount of seconds

        'waittime' is an optional pair of bounds for the random value;
        without it the bounds are 'wait_min' and 'wait_max'.
        """
        if waittime:
            seconds = random.uniform(*waittime)
        else:
            seconds = random.uniform(self.wait_min, self.wait_max)
        time.sleep(seconds)

    def login(self):
        """Login and set necessary cookies"""
        cookies = self._login_impl()
        for key, value in cookies.items():
            self.session.cookies.set(
                key, value, domain=".exhentai.org", path="/")

    # 360*24*60*60 is 360 days if 'maxage' is given in seconds
    # NOTE(review): unit of 'maxage' is defined by ..cache -- confirm
    @cache(maxage=360*24*60*60)
    def _login_impl(self):
        """Actual login implementation

        Returns the configured "cookies" value if it is a dict containing
        all cookies named in 'cnames'; otherwise logs in with the
        configured username and password and returns the cookies named in
        'cnames' from the response.

        Raises exception.AuthenticationError if username or password are
        not configured or if the login attempt is not successful.
        """
        cnames = ["ipb_member_id", "ipb_pass_hash"]

        try:
            cookies = config.get(("extractor", "exhentai", "cookies"))
            if isinstance(cookies, dict) and all(c in cookies for c in cnames):
                return cookies
        except TypeError:
            # deliberately best-effort: fall back to username and password
            pass

        username = config.interpolate(("extractor", "exhentai", "username"))
        password = config.interpolate(("extractor", "exhentai", "password"))
        if not username or not password:
            # without credentials the login cannot succeed;
            # fail right away instead of sending a useless request
            raise exception.AuthenticationError()

        url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
        params = {
            "CookieDate": "1",
            "b": "d",
            "bt": "1-1",
            "UserName": username,
            "PassWord": password,
            "ipb_login_submit": "Login!",
        }
        self.session.headers["Referer"] = \
            "http://e-hentai.org/bounce_login.php?b=d&bt=1-1"
        response = self.session.post(url, data=params)

        if "You are now logged in as:" not in response.text:
            raise exception.AuthenticationError()
        return {c: response.cookies[c] for c in cnames}
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`# -- coding: utf-8 --`

consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`# Copyright 2014-2016 Mike Fährmann`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`"""Extract images from galleries at https://exhentai.org/"""`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00
			`from .common import Extractor, Message`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`from .. import config, text, iso639_1, exception`
			`from ..cache import cache`
initial commit 2014-10-12 21:56:44 +02:00			`import time`
			`import random`

consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`class ExhentaiGalleryExtractor(Extractor):`
			`"""Extractor for image-galleries from exhentai.org"""`
update all other extractors 2015-11-21 04:26:30 +01:00			`category = "exhentai"`
consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`subcategory = "gallery"`
update all other extractors 2015-11-21 04:26:30 +01:00			`directory_fmt = ["{category}", "{gallery-id}"]`
			`filename_fmt = "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}"`
			`pattern = [r"(?:https?://)?(g\.e-\|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]`
add missing tests 2016-09-19 16:15:27 +02:00			`test = [("https://exhentai.org/g/960460/4f0e369d82/", {`
			`"keyword": "aaac45cad1897a9815384bc3a743ce7502c692f6",`
			`"content": "493d759de534355c9f55f8e365565b62411de146",`
			`})]`
[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`api_url = "https://exhentai.org/api.php"`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def __init__(self, match):`
			`Extractor.__init__(self)`
initial commit 2014-10-12 21:56:44 +02:00			`self.url = match.group(0)`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`self.version, self.gid, self.token = match.groups()`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`self.login()`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`self.session.headers.update({`
			`"User-Agent": "Mozilla/5.0",`
			`"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8",`
			`"Accept-Language": "en-US,en;q=0.5",`
[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`"Referer": "https://exhentai.org/",`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`})`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`self.wait_min = config.interpolate(("extractor", "exhentai", "wait-min"), 3)`
			`self.wait_max = config.interpolate(("extractor", "exhentai", "wait-max"), 6)`
			`if self.wait_max < self.wait_min:`
			`self.wait_max = self.wait_min`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def items(self):`
			`yield Message.Version, 1`
			`page = self.request(self.url).text`
[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`if page.startswith("Key missing") \`
			`or page.startswith("Gallery not found"):`
			`raise exception.NotFoundError("gallery")`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`data, url = self.get_job_metadata(page)`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`headers = self.session.headers.copy()`
			`headers["Accept"] = "image/png,image/;q=0.8,/*;q=0.5"`
			`yield Message.Headers, headers`
			`yield Message.Cookies, self.session.cookies`
			`yield Message.Directory, data`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`urlkey = "url"`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`if config.interpolate(("extractor", "exhentai", "download-original"), True):`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`urlkey = "origurl"`
			`for num, image in enumerate(self.get_images(url), 1):`
			`image.update(data)`
			`image["num"] = num`
code cleanup to use nameext_from_url 2015-11-16 17:32:26 +01:00			`text.nameext_from_url(image["url"], image)`
[exhentai] metadata consistency 2016-09-19 16:13:26 +02:00			`url = image[urlkey]`
			`del image["url"]`
			`del image["origurl"]`
			`if "/fullimg.php" in url:`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`self.wait((1, 2))`
[exhentai] metadata consistency 2016-09-19 16:13:26 +02:00			`yield Message.Url, url, image`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def get_job_metadata(self, page):`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`"""Collect metadata for extractor-job"""`
			`data = {`
update all other extractors 2015-11-21 04:26:30 +01:00			`"category" : self.category,`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`"gallery-id" : self.gid,`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`"gallery-token": self.token,`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`}`
			`data, _ = text.extract_all(page, (`
			`("title" , '<h1 id="gn">', '</h1>'),`
			`("title_jp", '<h1 id="gj">', '</h1>'),`
			`("date" , '>Posted:</td><td class="gdt2">', '</td>'),`
			`("language", '>Language:</td><td class="gdt2">', '</td>'),`
			`("size" , '>File Size:</td><td class="gdt2">', ' '),`
			`("count" , '>Length:</td><td class="gdt2">', ' '),`
			`("url" , 'hentai.org/s/', '"'),`
			`), values=data)`
[exhentai] better language keywords 2015-11-17 20:51:06 +01:00			`pos = data["language"].find(" ")`
			`if pos != -1:`
			`data["language"] = data["language"][:pos]`
			`data["lang"] = iso639_1.language_to_code(data["language"])`
[exhentai] unescape title 2016-08-31 10:20:46 +02:00			`data["title"] = text.unescape(data["title"])`
			`data["title_jp"] = text.unescape(data["title_jp"])`
[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`url = "https://exhentai.org/s/" + data["url"]`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`del data["url"]`
			`return data, url`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def get_images(self, url):`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`"""Collect url and metadata for all images in this gallery"""`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`self.wait()`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`page = self.request(url).text`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`data, pos = text.extract_all(page, (`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`(None , '<div id="i3"><a onclick="return load_image(', ''),`
			`("imgkey-next", "'", "'"),`
			`("url" , '<img id="img" src="', '"'),`
			`("title" , '<div id="i4"><div>', ' :: '),`
[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`("origurl" , 'https://exhentai.org/fullimg.php', '"'),`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`("startkey" , 'var startkey="', '";'),`
			`("showkey" , 'var showkey="', '";'),`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`))`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`data["imgkey"] = data["startkey"]`
fixed various bugs - forgot "self." before "name_fmt" - image keys where off by one 2014-10-15 16:17:59 +02:00
initial commit 2014-10-12 21:56:44 +02:00			`request = {`
			`"method" : "showpage",`
			`"page" : 2,`
[exhentai] metadata consistency 2016-09-19 16:13:26 +02:00			`"gid" : int(self.gid),`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`"imgkey" : data["imgkey-next"],`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`"showkey": data["showkey"],`
initial commit 2014-10-12 21:56:44 +02:00			`}`
[exhentai] metadata consistency 2016-09-19 16:13:26 +02:00			`del data["showkey"]`

			`if data["origurl"]:`
			`data["origurl"] = "https://exhentai.org/fullimg.php" + text.unescape(data["origurl"])`
			`else:`
			`data["origurl"] = data["url"]`
			`yield data`

initial commit 2014-10-12 21:56:44 +02:00			`while True:`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`if data["imgkey"] == data["imgkey-next"]:`
			`return`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`self.wait()`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`page = self.session.post(self.api_url, json=request).json()`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`data["imgkey"] = data["imgkey-next"]`
			`data["imgkey-next"], pos = text.extract(page["i3"], "'", "'")`
			`data["url"] , pos = text.extract(page["i3"], '<img id="img" src="', '"', pos)`
			`data["title"] , pos = text.extract(page["i" ], '<div>', ' :: ')`
			`data["origurl"] , pos = text.extract(page["i7"], '<a href="', '"')`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`if data["origurl"]:`
			`data["origurl"] = text.unescape(data["origurl"])`
			`else:`
			`data["origurl"] = data["url"]`
			`yield data`
[exhentai] fix 'imgkey' handling 2015-11-18 14:10:42 +01:00			`request["imgkey"] = data["imgkey-next"]`
initial commit 2014-10-12 21:56:44 +02:00			`request["page"] += 1`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00
			`def wait(self, waittime=None):`
			`"""Wait for a randomly chosen amount of seconds"""`
			`if not waittime:`
			`waittime = random.uniform(self.wait_min, self.wait_max)`
			`else:`
			`waittime = random.uniform(*waittime)`
			`time.sleep(waittime)`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00
			`def login(self):`
			`"""Login and set necessary cookies"""`
			`cookies = self._login_impl()`
			`for key, value in cookies.items():`
			`self.session.cookies.set(key, value, domain=".exhentai.org", path="/")`

			`@cache(maxage=3602460*60)`
			`def _login_impl(self):`
			`"""Actual login implementation"""`
			`cnames = ["ipb_member_id", "ipb_pass_hash"]`

			`try:`
			`cookies = config.get(("extractor", "exhentai", "cookies"))`
			`if isinstance(cookies, dict) and all(c in cookies for c in cnames):`
			`return cookies`
			`except TypeError:`
			`pass`

			`url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"`
			`params = {`
			`"CookieDate": "1",`
			`"b": "d",`
			`"bt": "1-1",`
			`"UserName": config.interpolate(("extractor", "exhentai", "username")),`
			`"PassWord": config.interpolate(("extractor", "exhentai", "password")),`
			`"ipb_login_submit": "Login!",`
			`}`
			`self.session.headers["Referer"] = "http://e-hentai.org/bounce_login.php?b=d&bt=1-1"`
			`response = self.session.post(url, data=params)`

			`if "You are now logged in as:" not in response.text:`
			`raise exception.AuthenticationError()`
			`return {c: response.cookies[c] for c in cnames}`