gallery-dl/gallery_dl/extractor/poipiku.py

# -*- coding: utf-8 -*-

# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://poipiku.com/"""

from .common import Extractor, Message
from .. import text

# Shared URL prefix for the extractor patterns below:
# an optional http(s) scheme followed by the poipiku.com host.
BASE_PATTERN = r"(?:https?://)?poipiku\.com"


class PoipikuExtractor(Extractor):
    """Base class for poipiku extractors"""
    # Subclasses supply posts(), which returns an iterable of post URLs
    # or root-relative post paths; items() does everything else.
    category = "poipiku"
    root = "https://poipiku.com"
    directory_fmt = ("{category}", "{user_id} {user_name}")
    filename_fmt = "{post_id}_{num}.{extension}"
    archive_fmt = "{post_id}_{num}"
    request_interval = (0.5, 1.5)

    def _init(self):
        """Preset the cookies that are sent along with every request"""
        set_cookie = self.cookies.set
        domain = "poipiku.com"
        # ask for the English UI
        set_cookie("LANG", "en", domain=domain)
        # content view mode "1" (needed for R-18 posts)
        set_cookie("POIPIKU_CONTENTS_VIEW_MODE", "1", domain=domain)

    def items(self):
        """Yield one Directory message plus Url messages for each post

        For every entry returned by posts(), the post page is fetched
        and parsed for metadata and thumbnails. When the page mentions
        'ShowAppendFile', the ShowAppendFileF.jsp endpoint is queried
        (with the configured 'password') for further thumbnails.
        """
        password = self.config("password", "")
        thumb_marker = 'class="IllustItemThumbImg" src="'
        static_prefixes = ("//img.poipiku.com/img/", "/img/")
        append_url = self.root + "/f/ShowAppendFileF.jsp"

        def to_file_url(thumb):
            # cut the trailing 8 characters off the thumbnail URL and
            # point the first '//img.' at the 'img-org' host instead
            url = text.ensure_http_scheme(thumb[:-8])
            return url.replace("//img.", "//img-org.", 1)

        for post_url in self.posts():
            # split before prefixing the root, so that the last two
            # segments are '<user_id>' and '<post_id>.html'
            segments = post_url.split("/")
            if post_url[0] == "/":
                post_url = self.root + post_url
            extr = text.extract_from(self.request(post_url).text)

            # 'extr' is stateful (the thumbnail loop below depends on
            # successive calls returning successive matches), so these
            # calls must stay in their original order
            category = extr("<title>[", "]")
            count = extr("(", " ")
            name_html = extr('<h2 class="UserInfoUserName">', "</")
            desc_html = extr('class="IllustItemDesc" >', "</h1>")

            post = {
                "post_category": category,
                "count": count,
                "post_id": segments[-1].partition(".")[0],
                "user_id": segments[-2],
                # keep only the text after the last '>' of the heading
                "user_name": text.unescape(name_html.rpartition(">")[2]),
                "description": text.unescape(desc_html),
                "_http_headers": {"Referer": post_url},
            }

            yield Message.Directory, post
            post["num"] = 0

            # thumbnails embedded in the post page itself
            thumb = extr(thumb_marker, '"')
            while thumb:
                # thumbnails served from a static '/img/' path are skipped
                if not thumb.startswith(static_prefixes):
                    post["num"] += 1
                    url = to_file_url(thumb)
                    yield Message.Url, url, text.nameext_from_url(url, post)
                thumb = extr(thumb_marker, '"')

            # nothing more to fetch unless the page has 'ShowAppendFile'
            if not extr("ShowAppendFile", "<"):
                continue

            headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Origin": self.root,
                "Referer": post_url,
            }
            payload = {
                "UID": post["user_id"],
                "IID": post["post_id"],
                "PAS": password,
                "MD": "0",
                "TWF": "-1",
            }
            response = self.request(
                append_url, method="POST", headers=headers, data=payload)
            result = response.json()

            html = result["html"]
            # a negative 'result_num' is reported as a warning, using
            # the returned HTML as message text
            status = result.get("result_num") or 0
            if status < 0:
                self.log.warning("'%s'", html.replace("<br/>", " "))

            for thumb in text.extract_iter(html, thumb_marker, '"'):
                post["num"] += 1
                url = to_file_url(thumb)
                yield Message.Url, url, text.nameext_from_url(url, post)


class PoipikuUserExtractor(PoipikuExtractor):
    """Extractor for posts from a poipiku user"""
    subcategory = "user"
    # group 1: optional 'PG' value of an IllustListPcV.jsp URL
    # group 2: numeric user ID
    pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"
               r"(\d+)/?(?:$|[?&#])")
    example = "https://poipiku.com/12345/"

    def __init__(self, match):
        """Store the start page (may be None) and user ID from 'match'"""
        super().__init__(match)
        start_page, user_id = match.groups()
        self._page = start_page
        self.user_id = user_id

    def posts(self):
        """Yield the post links found on consecutive listing pages

        Starts at the page number given in the URL (0 if absent) and
        stops after the first page with fewer than 48 links.
        """
        endpoint = self.root + "/IllustListPcV.jsp"
        query = {
            "PG": text.parse_int(self._page, 0),
            "ID": self.user_id,
            "KWD": "",
        }

        while True:
            listing = self.request(endpoint, params=query).text

            # 'found' ends up as the number of links on this page
            found = 0
            for found, path in enumerate(text.extract_iter(
                    listing, 'class="IllustInfo" href="', '"'), 1):
                yield path

            if found < 48:
                return
            query["PG"] += 1


class PoipikuPostExtractor(PoipikuExtractor):
    """Extractor for a poipiku post"""
    subcategory = "post"
    # group 1: numeric user ID, group 2: numeric post ID
    pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
    example = "https://poipiku.com/12345/12345.html"

    def __init__(self, match):
        """Store the user ID and post ID captured by 'pattern'"""
        super().__init__(match)
        user_id, post_id = match.groups()
        self.user_id = user_id
        self.post_id = post_id

    def posts(self):
        """Return a 1-tuple with the root-relative path of the post"""
        template = "/{}/{}.html"
        path = template.format(self.user_id, self.post_id)
        return (path,)
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`# -*- coding: utf-8 -*-`

[poipiku] warn about login requirements 2023-01-05 12:29:31 +01:00			`# Copyright 2022-2023 Mike Fährmann`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://poipiku.com/"""`

			`from .common import Extractor, Message`
			`from .. import text`

			`BASE_PATTERN = r"(?:https?://)?poipiku\.com"`


			`class PoipikuExtractor(Extractor):`
			`"""Base class for poipiku extractors"""`
			`category = "poipiku"`
			`root = "https://poipiku.com"`
			`directory_fmt = ("{category}", "{user_id} {user_name}")`
			`filename_fmt = "{post_id}_{num}.{extension}"`
			`archive_fmt = "{post_id}_{num}"`
			`request_interval = (0.5, 1.5)`

[poipiku] fix downloading R-18 posts (#5567) … by automatically sending a `POIPIKU_CONTENTS_VIEW_MODE=1` cookie to enable "adult" mode. 2024-05-09 15:14:08 +02:00			`def _init(self):`
[poipiku] send LANG cookie to ensure English UI (#5590) 2024-05-17 02:50:31 +02:00			`self.cookies.set(`
			`"LANG", "en", domain="poipiku.com")`
[poipiku] fix downloading R-18 posts (#5567) … by automatically sending a `POIPIKU_CONTENTS_VIEW_MODE=1` cookie to enable "adult" mode. 2024-05-09 15:14:08 +02:00			`self.cookies.set(`
			`"POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com")`

[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`def items(self):`
[poipiku] add simple password support (#1602) 2022-06-22 18:21:01 +02:00			`password = self.config("password", "")`

[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`for post_url in self.posts():`
			`parts = post_url.split("/")`
			`if post_url[0] == "/":`
			`post_url = self.root + post_url`
			`page = self.request(post_url).text`
			`extr = text.extract_from(page)`

			`post = {`
			`"post_category": extr("<title>[", "]"),`
			`"count" : extr("(", " "),`
			`"post_id" : parts[-1].partition(".")[0],`
			`"user_id" : parts[-2],`
			`"user_name" : text.unescape(extr(`
			`'<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),`
			`"description": text.unescape(extr(`
[poipiku] extract full 'descriptions' (#4066) don't cut it off after the first line 2023-05-15 23:16:31 +02:00			`'class="IllustItemDesc" >', '</h1>')),`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2022-09-12 11:21:01 +02:00			`"_http_headers": {"Referer": post_url},`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`}`

			`yield Message.Directory, post`
			`post["num"] = 0`

			`while True:`
			`thumb = extr('class="IllustItemThumbImg" src="', '"')`
			`if not thumb:`
			`break`
[poipiku] update filter for static images (#2796) 2022-08-01 12:36:19 +02:00			`elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`continue`
			`post["num"] += 1`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2022-09-12 11:21:01 +02:00			`url = text.ensure_http_scheme(thumb[:-8]).replace(`
			`"//img.", "//img-org.", 1)`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`yield Message.Url, url, text.nameext_from_url(url, post)`

[poipoku] avoid language-specific extr 2024-05-13 20:42:02 +02:00			`if not extr('ShowAppendFile', '<'):`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`continue`

			`url = self.root + "/f/ShowAppendFileF.jsp"`
			`headers = {`
			`"Accept" : "application/json, text/javascript, */*; q=0.01",`
			`"X-Requested-With": "XMLHttpRequest",`
			`"Origin" : self.root,`
			`"Referer": post_url,`
			`}`
			`data = {`
			`"UID": post["user_id"],`
			`"IID": post["post_id"],`
[poipiku] add simple password support (#1602) 2022-06-22 18:21:01 +02:00			`"PAS": password,`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`"MD" : "0",`
			`"TWF": "-1",`
			`}`
[poipiku] improve error detection (#4206) 2023-06-27 21:45:44 +02:00			`resp = self.request(`
			`url, method="POST", headers=headers, data=data).json()`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00
[poipiku] improve error detection (#4206) 2023-06-27 21:45:44 +02:00			`page = resp["html"]`
			`if (resp.get("result_num") or 0) < 0:`
			`self.log.warning("'%s'", page.replace("<br/>", " "))`
[poipiku] warn about login requirements 2023-01-05 12:29:31 +01:00
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`for thumb in text.extract_iter(`
			`page, 'class="IllustItemThumbImg" src="', '"'):`
			`post["num"] += 1`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2022-09-12 11:21:01 +02:00			`url = text.ensure_http_scheme(thumb[:-8]).replace(`
			`"//img.", "//img-org.", 1)`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00			`yield Message.Url, url, text.nameext_from_url(url, post)`


			`class PoipikuUserExtractor(PoipikuExtractor):`
			`"""Extractor for posts from a poipiku user"""`
			`subcategory = "user"`
			`pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"`
			`r"(\d+)/?(?:$|[?&#])")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://poipiku.com/12345/"`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00
			`def __init__(self, match):`
			`PoipikuExtractor.__init__(self, match)`
			`self._page, self.user_id = match.groups()`

			`def posts(self):`
			`url = self.root + "/IllustListPcV.jsp"`
			`params = {`
			`"PG" : text.parse_int(self._page, 0),`
			`"ID" : self.user_id,`
			`"KWD": "",`
			`}`

			`while True:`
			`page = self.request(url, params=params).text`

			`cnt = 0`
			`for path in text.extract_iter(`
			`page, 'class="IllustInfo" href="', '"'):`
			`yield path`
			`cnt += 1`

			`if cnt < 48:`
			`return`
			`params["PG"] += 1`


			`class PoipikuPostExtractor(PoipikuExtractor):`
			`"""Extractor for a poipiku post"""`
			`subcategory = "post"`
			`pattern = BASE_PATTERN + r"/(\d+)/(\d+)"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://poipiku.com/12345/12345.html"`
[poipiku] add 'user' and 'post' extractors (#1602) 2022-06-20 11:25:42 +02:00
			`def __init__(self, match):`
			`PoipikuExtractor.__init__(self, match)`
			`self.user_id, self.post_id = match.groups()`

			`def posts(self):`
			`return ("/{}/{}.html".format(self.user_id, self.post_id),)`