gallery-dl/gallery_dl/extractor/exhentai.py

# -*- coding: utf-8 -*-

# Copyright 2014-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from galleries at https://exhentai.org/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import time
import random
import requests


class ExhentaiGalleryExtractor(Extractor):
    """Extractor for image-galleries from exhentai.org"""
    category = "exhentai"
    subcategory = "gallery"
    directory_fmt = ["{category}", "{gallery-id}"]
    filename_fmt = "{gallery-id}_{num:>04}_{image-token}_{name}.{extension}"
    pattern = [r"(?:https?://)?(g\.e-|e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
    test = [
        ("https://exhentai.org/g/960460/4f0e369d82/", {
            "keyword": "623f8c86c9fe38e964682dd4309b96922655b900",
            "content": "493d759de534355c9f55f8e365565b62411de146",
        }),
        ("https://exhentai.org/g/960461/4f0e369d82/", {
            "exception": exception.NotFoundError,
        }),
        ("http://exhentai.org/g/962698/7f02358e00/", {
            "exception": exception.AuthorizationError,
        }),
    ]
    # default site root; login() replaces it with e-hentai.org on the
    # instance if no username is available
    root = "https://exhentai.org"

    def __init__(self, match):
        """Set up extractor state from a match of 'pattern'

        'match' supplies the site prefix, the gallery id and the gallery
        token (the three capture groups of 'pattern').
        """
        Extractor.__init__(self)
        # keys taken from the first image page ("start", "show", "next")
        self.key = {}
        # number of images in the gallery; set in items()
        self.count = 0
        self.version, self.gid, self.token = match.groups()
        self.original = self.config("original", True)
        self.wait_min = self.config("wait-min", 3)
        self.wait_max = self.config("wait-max", 6)
        # keep the interval well-formed if it is configured inverted
        if self.wait_max < self.wait_min:
            self.wait_max = self.wait_min

    def items(self):
        """Yield all messages for this gallery

        Emits version, headers and cookies first, then the directory
        metadata of the gallery and finally one Url message per image.

        Raises exception.AuthorizationError on a 404 response containing
        "Gallery Not Available" and exception.NotFoundError if the
        response starts with "Key missing" or "Gallery not found".
        """
        self.login()
        yield Message.Version, 1
        yield Message.Headers, self.setup_headers()
        yield Message.Cookies, self.session.cookies

        url = "{}/g/{}/{}/".format(self.root, self.gid, self.token)
        response = self.session.get(url)
        page = response.text
        if response.status_code == 404 and "Gallery Not Available" in page:
            raise exception.AuthorizationError()
        if page.startswith(("Key missing", "Gallery not found")):
            raise exception.NotFoundError("gallery")
        data = self.get_job_metadata(page)
        self.count = int(data["count"])
        yield Message.Directory, data

        # 'data' is updated in place, so the same dict object is yielded
        # for every image
        for url, image in self.get_images(page):
            data.update(image)
            if "/fullimg.php" in url:
                # original-image URLs get an empty extension and an
                # additional 1-2 second pause
                data["extension"] = ""
                self.wait((1, 2))
            yield Message.Url, url, data

    def setup_headers(self):
        """Initialize session headers and return headers for image requests

        The session headers are updated in place; the returned value is a
        copy of them with an image-specific "Accept" entry.
        """
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml,"
                      "application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": self.root + "/",
        })
        headers = self.session.headers.copy()
        headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5"
        return headers

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job

        Returns a dict with the gallery id and token plus the fields
        extracted from the gallery 'page'; "lang" is derived from
        "language", and both titles are unescaped.
        """
        data = {
            "gallery-id"   : self.gid,
            "gallery-token": self.token,
        }
        # NOTE(review): the empty start marker of "size-units" presumably
        # makes it continue right after the "size" value -- confirm
        # against text.extract_all
        text.extract_all(page, (
            ("title"     , '<h1 id="gn">', '</h1>'),
            ("title_jp"  , '<h1 id="gj">', '</h1>'),
            ("date"      , '>Posted:</td><td class="gdt2">', '</td>'),
            ("language"  , '>Language:</td><td class="gdt2">', ' '),
            ("size"      , '>File Size:</td><td class="gdt2">', ' '),
            ("size-units", '', '<'),
            ("count"     , '>Length:</td><td class="gdt2">', ' '),
        ), values=data)
        data["lang"] = util.language_to_code(data["language"])
        data["title"] = text.unescape(data["title"])
        data["title_jp"] = text.unescape(data["title_jp"])
        return data

    def get_images(self, page):
        """Collect url and metadata for all images in this gallery

        The first image is read from its webpage, all following ones are
        requested through api calls.

        Raises exception.NotFoundError if 'page' contains no link to an
        image page.
        """
        part = text.extract(page, 'hentai.org/s/', '"')[0]
        if not part:
            # fail with a descriptive error instead of a TypeError from
            # the string concatenation below
            raise exception.NotFoundError("image page")
        yield self.image_from_page(self.root + "/s/" + part)
        yield from self.images_from_api()

    def image_from_page(self, url):
        """Get image url and data from webpage

        Stores the "start", "show" and "next" keys found on the page in
        self.key and returns a (url, metadata) tuple for image number 1.
        """
        self.wait()
        page = self.request(url).text
        # NOTE(review): the entry named None presumably only moves the
        # search position forward -- confirm against text.extract_all
        data = text.extract_all(page, (
            (None      , '<div id="i3"><a onclick="return load_image(', ''),
            ("nextkey" , "'", "'"),
            ("url"     , '<img id="img" src="', '"'),
            ("origurl" , 'hentai.org/fullimg.php', '"'),
            ("startkey", 'var startkey="', '";'),
            ("showkey" , 'var showkey="', '";'),
        ))[0]
        self.key["start"] = data["startkey"]
        self.key["show"] = data["showkey"]
        self.key["next"] = data["nextkey"]

        if self.original and data["origurl"]:
            part = text.unescape(data["origurl"])
            url = self.root + "/fullimg.php" + part
        else:
            url = data["url"]

        # filename and extension always come from the regular image url
        return url, text.nameext_from_url(data["url"], {
            "num": 1,
            "image-token": data["startkey"],
        })

    def images_from_api(self):
        """Get image url and data from api calls

        Yields (url, metadata) tuples for the images 2 to self.count.
        Relies on the keys stored by image_from_page(). Requests failing
        with a ConnectionError are logged and repeated until they
        succeed.
        """
        api_url = self.root + "/api.php"
        nextkey = self.key["next"]
        request = {
            "method" : "showpage",
            "gid"    : int(self.gid),
            "imgkey" : nextkey,
            "showkey": self.key["show"],
        }
        for num in range(2, self.count + 1):
            request["page"] = num
            while True:
                self.wait()
                try:
                    page = self.session.post(api_url, json=request).json()
                    break
                except requests.exceptions.ConnectionError as exc:
                    # keep retrying, but leave a trace of the failure
                    self.log.warning(
                        "API request for page %d failed (%s); retrying",
                        num, exc)

            # the key used for this request identifies the current image;
            # the response carries the key for the next one
            imgkey = nextkey
            nextkey, pos = text.extract(page["i3"], "'", "'")
            imgurl = text.extract(page["i3"], 'id="img" src="', '"', pos)[0]
            origurl = text.extract(page["i7"], '<a href="', '"')[0]

            if self.original and origurl:
                url = text.unescape(origurl)
            else:
                url = imgurl

            yield url, text.nameext_from_url(imgurl, {
                "num": num,
                "image-token": imgkey
            })
            request["imgkey"] = nextkey

    def wait(self, waittime=None):
        """Wait for a randomly chosen amount of seconds

        'waittime' is an optional (lower, upper) pair of bounds; without
        it the configured wait-min and wait-max values are used.
        """
        if not waittime:
            waittime = random.uniform(self.wait_min, self.wait_max)
        else:
            waittime = random.uniform(*waittime)
        time.sleep(waittime)

    def login(self):
        """Login and set necessary cookies

        Without a username the extractor switches to e-hentai.org and
        disables downloads of original images instead of logging in.
        """
        username, password = self.auth_info()
        if not username:
            self.log.info("no username given; using e-hentai.org")
            self.root = "https://e-hentai.org"
            self.original = False
            return
        cookies = self._login_impl(username, password)
        for key, value in cookies.items():
            self.session.cookies.set(
                key, value, domain=".exhentai.org", path="/")

    # NOTE(review): presumably caches the returned cookies per username
    # (argument 1) for 90 days -- confirm against ..cache
    @cache(maxage=90*24*60*60, keyarg=1)
    def _login_impl(self, username, password):
        """Actual login implementation

        Returns a dict holding the "ipb_member_id" and "ipb_pass_hash"
        cookies. A "cookies" config dict containing both names is
        returned as is, without sending a login request.

        Raises exception.AuthenticationError if the login response does
        not confirm the login.
        """
        self.log.info("Logging in as %s", username)
        cnames = ["ipb_member_id", "ipb_pass_hash"]

        try:
            cookies = self.config("cookies")
            if isinstance(cookies, dict) and all(c in cookies for c in cnames):
                return cookies
        except TypeError:
            pass

        url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
        params = {
            "CookieDate": "1",
            "b": "d",
            "bt": "1-1",
            "UserName": username,
            "PassWord": password,
            "ipb_login_submit": "Login!",
        }
        referer = "https://e-hentai.org/bounce_login.php?b=d&bt=1-1"
        self.session.headers["Referer"] = referer
        response = self.session.post(url, data=params)

        if "You are now logged in as:" not in response.text:
            raise exception.AuthenticationError()
        return {c: response.cookies[c] for c in cnames}
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`# -- coding: utf-8 --`

add login notifications 2017-03-17 09:42:59 +01:00			`# Copyright 2014-2017 Mike Fährmann`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`"""Extract images from galleries at https://exhentai.org/"""`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00
			`from .common import Extractor, Message`
implement and use extractor.config() method 2017-04-25 17:12:48 +02:00			`from .. import text, util, exception`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`from ..cache import cache`
initial commit 2014-10-12 21:56:44 +02:00			`import time`
			`import random`
[exhentai] retry failed api calls 2016-10-11 13:27:19 +02:00			`import requests`
initial commit 2014-10-12 21:56:44 +02:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`class ExhentaiGalleryExtractor(Extractor):`
			`"""Extractor for image-galleries from exhentai.org"""`
update all other extractors 2015-11-21 04:26:30 +01:00			`category = "exhentai"`
consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`subcategory = "gallery"`
update all other extractors 2015-11-21 04:26:30 +01:00			`directory_fmt = ["{category}", "{gallery-id}"]`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`filename_fmt = "{gallery-id}_{num:>04}_{image-token}_{name}.{extension}"`
[exhentai] accept "e-hentai.org" URLs (#11) 2017-04-04 09:30:35 +02:00			`pattern = [r"(?:https?://)?(g\.e-\|e-\|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]`
add a few more tests 2016-12-31 00:51:06 +01:00			`test = [`
			`("https://exhentai.org/g/960460/4f0e369d82/", {`
			`"keyword": "623f8c86c9fe38e964682dd4309b96922655b900",`
			`"content": "493d759de534355c9f55f8e365565b62411de146",`
			`}),`
			`("https://exhentai.org/g/960461/4f0e369d82/", {`
			`"exception": exception.NotFoundError,`
			`}),`
			`("http://exhentai.org/g/962698/7f02358e00/", {`
			`"exception": exception.AuthorizationError,`
			`}),`
			`]`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`root = "https://exhentai.org"`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def __init__(self, match):`
			`Extractor.__init__(self)`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`self.key = {}`
[exhentai] use image-count as stop signal 2016-10-12 15:19:31 +02:00			`self.count = 0`
[exhentai] accept "e-hentai.org" URLs (#11) 2017-04-04 09:30:35 +02:00			`self.version, self.gid, self.token = match.groups()`
implement and use extractor.config() method 2017-04-25 17:12:48 +02:00			`self.original = self.config("original", True)`
			`self.wait_min = self.config("wait-min", 3)`
			`self.wait_max = self.config("wait-max", 6)`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`if self.wait_max < self.wait_min:`
			`self.wait_max = self.wait_min`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def items(self):`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`self.login()`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`yield Message.Version, 1`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`yield Message.Headers, self.setup_headers()`
			`yield Message.Cookies, self.session.cookies`

[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`url = "{}/g/{}/{}/".format(self.root, self.gid, self.token)`
[exhentai] raise proper exception for 'unavailable' galleries 2016-12-22 12:42:41 +01:00			`response = self.session.get(url)`
			`page = response.text`
			`if response.status_code == 404 and "Gallery Not Available" in page:`
			`raise exception.AuthorizationError()`
[exhentai] fix detection of invalid gallery keys 2017-02-15 03:36:46 +01:00			`if page.startswith(("Key missing", "Gallery not found")):`
[exhentai] transition to https 2016-08-30 09:17:40 +02:00			`raise exception.NotFoundError("gallery")`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`data = self.get_job_metadata(page)`
[exhentai] use image-count as stop signal 2016-10-12 15:19:31 +02:00			`self.count = int(data["count"])`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`yield Message.Directory, data`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`for url, image in self.get_images(page):`
			`data.update(image)`
[exhentai] metadata consistency 2016-09-19 16:13:26 +02:00			`if "/fullimg.php" in url:`
allow extension by Content-Type for exhentai, seiga, senmanga 2016-09-30 16:43:43 +02:00			`data["extension"] = ""`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`self.wait((1, 2))`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`yield Message.Url, url, data`

			`def setup_headers(self):`
			`"""Initialize headers"""`
			`self.session.headers.update({`
			`"User-Agent": "Mozilla/5.0",`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`"Accept": "text/html,application/xhtml+xml,"`
			`"application/xml;q=0.9,/;q=0.8",`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`"Accept-Language": "en-US,en;q=0.5",`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`"Referer": self.root + "/",`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`})`
			`headers = self.session.headers.copy()`
			`headers["Accept"] = "image/png,image/;q=0.8,/*;q=0.5"`
			`return headers`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`def get_job_metadata(self, page):`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`"""Collect metadata for extractor-job"""`
			`data = {`
			`"gallery-id" : self.gid,`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`"gallery-token": self.token,`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`}`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`text.extract_all(page, (`
			`("title" , '<h1 id="gn">', '</h1>'),`
			`("title_jp" , '<h1 id="gj">', '</h1>'),`
			`("date" , '>Posted:</td><td class="gdt2">', '</td>'),`
			`("language" , '>Language:</td><td class="gdt2">', ' '),`
			`("size" , '>File Size:</td><td class="gdt2">', ' '),`
			`("size-units", '', '<'),`
			`("count" , '>Length:</td><td class="gdt2">', ' '),`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`), values=data)`
move code into util.py 2017-03-28 13:12:44 +02:00			`data["lang"] = util.language_to_code(data["language"])`
[exhentai] unescape title 2016-08-31 10:20:46 +02:00			`data["title"] = text.unescape(data["title"])`
			`data["title_jp"] = text.unescape(data["title_jp"])`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`return data`
initial commit 2014-10-12 21:56:44 +02:00
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`def get_images(self, page):`
[exhentai] use text.extract_all 2015-11-03 00:10:30 +01:00			`"""Collect url and metadata for all images in this gallery"""`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`part = text.extract(page, 'hentai.org/s/', '"')[0]`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`yield self.image_from_page(self.root + "/s/" + part)`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`yield from self.images_from_api()`

			`def image_from_page(self, url):`
			`"""Get image url and data from webpage"""`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00			`self.wait()`
[exhentai] reenable extractor 2015-10-31 16:50:20 +01:00			`page = self.request(url).text`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`data = text.extract_all(page, (`
			`(None , '<div id="i3"><a onclick="return load_image(', ''),`
			`("nextkey" , "'", "'"),`
			`("url" , '<img id="img" src="', '"'),`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`("origurl" , 'hentai.org/fullimg.php', '"'),`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`("startkey", 'var startkey="', '";'),`
			`("showkey" , 'var showkey="', '";'),`
			`))[0]`
			`self.key["start"] = data["startkey"]`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`self.key["show"] = data["showkey"]`
			`self.key["next"] = data["nextkey"]`

			`if self.original and data["origurl"]:`
			`part = text.unescape(data["origurl"])`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`url = self.root + "/fullimg.php" + part`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`else:`
			`url = data["url"]`

[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`return url, text.nameext_from_url(data["url"], {`
			`"num": 1,`
			`"image-token": data["startkey"],`
			`})`
fixed various bugs - forgot "self." before "name_fmt" - image keys where off by one 2014-10-15 16:17:59 +02:00
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`def images_from_api(self):`
			`"""Get image url and data from api calls"""`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`api_url = self.root + "/api.php"`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`nextkey = self.key["next"]`
initial commit 2014-10-12 21:56:44 +02:00			`request = {`
			`"method" : "showpage",`
[exhentai] metadata consistency 2016-09-19 16:13:26 +02:00			`"gid" : int(self.gid),`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`"imgkey" : nextkey,`
			`"showkey": self.key["show"],`
initial commit 2014-10-12 21:56:44 +02:00			`}`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`for request["page"] in range(2, self.count + 1):`
[exhentai] retry failed api calls 2016-10-11 13:27:19 +02:00			`while True:`
			`try:`
[exhentai] use image-count as stop signal 2016-10-12 15:19:31 +02:00			`self.wait()`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`page = self.session.post(api_url, json=request).json()`
[exhentai] retry failed api calls 2016-10-11 13:27:19 +02:00			`break`
[exhentai] use image-count as stop signal 2016-10-12 15:19:31 +02:00			`except requests.exceptions.ConnectionError:`
			`pass`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`imgkey = nextkey`
			`nextkey, pos = text.extract(page["i3"], "'", "'")`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)`
[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`origurl, pos = text.extract(page["i7"], '<a href="', '"')`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
			`if self.original and origurl:`
			`url = text.unescape(origurl)`
			`else:`
			`url = imgurl`

[exhentai] rewrite 2016-09-20 19:01:16 +02:00			`yield url, text.nameext_from_url(imgurl, {`
			`"num": request["page"],`
			`"image-token": imgkey`
			`})`
			`request["imgkey"] = nextkey`
[exhentai] configurable wait-times 2015-11-19 17:04:54 +01:00
			`def wait(self, waittime=None):`
			`"""Wait for a randomly chosen amount of seconds"""`
			`if not waittime:`
			`waittime = random.uniform(self.wait_min, self.wait_max)`
			`else:`
			`waittime = random.uniform(*waittime)`
			`time.sleep(waittime)`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00
			`def login(self):`
			`"""Login and set necessary cookies"""`
add .netrc support (#22) Use the '--netrc' cmdline option or set the 'netrc' config option to 'true' to enable the use of .netrc authentication data. The 'machine' names for the .netrc info are the lowercase extractor names (or categories): batoto, exhentai, nijie, pixiv, seiga. 2017-06-24 12:17:26 +02:00			`username, password = self.auth_info()`
[exhentai] fall back to e-hentai if no username is given 2017-04-28 15:59:56 +02:00			`if not username:`
			`self.log.info("no username given; using e-hentai.org")`
			`self.root = "https://e-hentai.org"`
			`self.original = False`
			`return`
adjust login methods to a specific style 2017-01-08 17:33:25 +01:00			`cookies = self._login_impl(username, password)`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`for key, value in cookies.items():`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`self.session.cookies.set(`
			`key, value, domain=".exhentai.org", path="/")`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00
add login notifications 2017-03-17 09:42:59 +01:00			`@cache(maxage=902460*60, keyarg=1)`
adjust login methods to a specific style 2017-01-08 17:33:25 +01:00			`def _login_impl(self, username, password):`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`"""Actual login implementation"""`
add login notifications 2017-03-17 09:42:59 +01:00			`self.log.info("Logging in as %s", username)`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`cnames = ["ipb_member_id", "ipb_pass_hash"]`

			`try:`
implement and use extractor.config() method 2017-04-25 17:12:48 +02:00			`cookies = self.config("cookies")`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`if isinstance(cookies, dict) and all(c in cookies for c in cnames):`
			`return cookies`
			`except TypeError:`
			`pass`

			`url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"`
			`params = {`
			`"CookieDate": "1",`
			`"b": "d",`
			`"bt": "1-1",`
adjust login methods to a specific style 2017-01-08 17:33:25 +01:00			`"UserName": username,`
			`"PassWord": password,`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`"ipb_login_submit": "Login!",`
			`}`
implement and use extractor.config() method 2017-04-25 17:12:48 +02:00			`referer = "https://e-hentai.org/bounce_login.php?b=d&bt=1-1"`
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00			`self.session.headers["Referer"] = referer`
[exhentai] provide username/password auth 2016-07-23 17:55:46 +02:00			`response = self.session.post(url, data=params)`

			`if "You are now logged in as:" not in response.text:`
			`raise exception.AuthenticationError()`
			`return {c: response.cookies[c] for c in cnames}`