From 27e8078fb7c555cc4a9ca1492e76a6e3947ff64f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Mon, 20 Jun 2022 11:25:42 +0200
Subject: [PATCH] [poipiku] add 'user' and 'post' extractors (#1602)

---
Review notes (text between "---" and the first "diff --git" is not part
of the commit message):
- This patch was recovered from a copy in which line breaks, indentation
  and some angle-bracket markup had been lost. All indentation and the
  <tr>/<td> rows of the docs/supportedsites.md hunk are reconstructed
  from the hunk header counts; verify them against the upstream commit.
  String literals in poipiku.py are kept exactly as they were found and
  may be missing markup as well (TODO confirm).
- The author's e-mail address is missing from the From: header.
- The blob id on the poipiku.py "index" line predates the comments and
  the startswith() change made in this revision.

 docs/supportedsites.md           |   6 ++
 gallery_dl/extractor/__init__.py |   1 +
 gallery_dl/extractor/poipiku.py  | 187 +++++++++++++++++++++++++++++++
 3 files changed, 194 insertions(+)
 create mode 100644 gallery_dl/extractor/poipiku.py

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index cee47248..4819f570 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -631,6 +631,12 @@ Consider all sites to be NSFW unless otherwise known.
     <td>Posts, Timelines</td>
     <td></td>
 </tr>
+<tr>
+    <td>Poipiku</td>
+    <td>https://poipiku.com/</td>
+    <td>Posts, User Profiles</td>
+    <td></td>
+</tr>
 <tr>
     <td>Porn Image</td>
     <td>https://porn-images-xxx.com/</td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 6028b343..e273f843 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -107,6 +107,7 @@ modules = [
     "pixiv",
     "pixnet",
     "plurk",
+    "poipiku",
     "pornhub",
     "pururin",
     "reactor",
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
new file mode 100644
index 00000000..3b0cc7a0
--- /dev/null
+++ b/gallery_dl/extractor/poipiku.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://poipiku.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?poipiku\.com"
+
+
+class PoipikuExtractor(Extractor):
+    """Base class for poipiku extractors
+
+    Subclasses implement posts(), which yields post page URLs or
+    root-relative paths. items() requests each of them and yields one
+    Directory message per post followed by one Url message per image.
+    """
+    category = "poipiku"
+    root = "https://poipiku.com"
+    directory_fmt = ("{category}", "{user_id} {user_name}")
+    filename_fmt = "{post_id}_{num}.{extension}"
+    archive_fmt = "{post_id}_{num}"
+    request_interval = (0.5, 1.5)
+
+    def items(self):
+        """Yield Directory and Url messages for every post in posts()"""
+        for post_url in self.posts():
+            # user ID and post ID are the last two path components
+            parts = post_url.split("/")
+            if post_url.startswith("/"):
+                post_url = self.root + post_url
+            page = self.request(post_url).text
+            extr = text.extract_from(page)
+
+            # presumably extr() resumes after its previous match, so the
+            # order of these calls matters -- confirm in text.extract_from
+            post = {
+                "post_category": extr("[", "]"),
+                "count"      : extr("(", " "),
+                "post_id"    : parts[-1].partition(".")[0],
+                "user_id"    : parts[-2],
+                "user_name"  : text.unescape(extr(
+                    '<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
+                "description": text.unescape(extr(
+                    'class="IllustItemDesc" >', '<')),
+            }
+
+            yield Message.Directory, post
+            post["num"] = 0
+
+            # images embedded in the post page itself
+            while True:
+                thumb = extr('class="IllustItemThumbImg" src="', '"')
+                if not thumb:
+                    break
+                elif thumb.startswith("/img/"):
+                    # sources under "/img/" are skipped, not downloaded
+                    continue
+                post["num"] += 1
+                # drop the last 8 characters of the thumbnail URL
+                # (presumably a thumbnail suffix -- TODO confirm)
+                url = text.ensure_http_scheme(thumb[:-8])
+                yield Message.Url, url, text.nameext_from_url(url, post)
+
+            # skip the extra request unless a "show all" marker is found
+            if not extr('</i> show all', '<'):
+                continue
+
+            # further images come from the "html" field of this
+            # endpoint's JSON response
+            url = self.root + "/f/ShowAppendFileF.jsp"
+            headers = {
+                "Accept" : "application/json, text/javascript, */*; q=0.01",
+                "X-Requested-With": "XMLHttpRequest",
+                "Origin" : self.root,
+                "Referer": post_url,
+            }
+            data = {
+                "UID": post["user_id"],
+                "IID": post["post_id"],
+                "PAS": "",
+                "MD" : "0",
+                "TWF": "-1",
+            }
+            page = self.request(
+                url, method="POST", headers=headers, data=data).json()["html"]
+
+            for thumb in text.extract_iter(
+                    page, 'class="IllustItemThumbImg" src="', '"'):
+                post["num"] += 1
+                url = text.ensure_http_scheme(thumb[:-8])
+                yield Message.Url, url, text.nameext_from_url(url, post)
+
+
+class PoipikuUserExtractor(PoipikuExtractor):
+    """Extractor for posts from a poipiku user"""
+    subcategory = "user"
+    pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"
+               r"(\d+)/?(?:$|[?&#])")
+    test = (
+        ("https://poipiku.com/25049/", {
+            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+                       r"/\d+_\w+\.(jpe?g|png)$",
+            "range": "1-10",
+            "count": 10,
+        }),
+        ("https://poipiku.com/IllustListPcV.jsp?PG=1&ID=25049&KWD=")
+    )
+
+    def __init__(self, match):
+        PoipikuExtractor.__init__(self, match)
+        # _page is None when the URL has no "PG=" part
+        self._page, self.user_id = match.groups()
+
+    def posts(self):
+        """Yield the href of every post on the user's listing pages"""
+        url = self.root + "/IllustListPcV.jsp"
+        params = {
+            "PG" : text.parse_int(self._page, 0),
+            "ID" : self.user_id,
+            "KWD": "",
+        }
+
+        while True:
+            page = self.request(url, params=params).text
+
+            cnt = 0
+            for path in text.extract_iter(
+                    page, 'class="IllustInfo" href="', '"'):
+                yield path
+                cnt += 1
+
+            # a page with fewer than 48 entries is treated as the last
+            if cnt < 48:
+                return
+            params["PG"] += 1
+
+
+class PoipikuPostExtractor(PoipikuExtractor):
+    """Extractor for a poipiku post"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
+    test = (
+        ("https://poipiku.com/25049/5864576.html", {
+            "pattern": r"https://img\.poipiku\.com/user_img03/000025049"
+                       r"/005864576_EWN1Y65gQ\.png$",
+            "keyword": {
+                "count": "1",
+                "description": "",
+                "extension": "png",
+                "filename": "005864576_EWN1Y65gQ",
+                "num": 1,
+                "post_category": "DOODLE",
+                "post_id": "5864576",
+                "user_id": "25049",
+                "user_name": "ユキウサギ",
+            },
+        }),
+        ("https://poipiku.com/2166245/6411749.html", {
+            "pattern": r"https://img\.poipiku\.com/user_img01/002166245"
+                       r"/006411749_\w+\.jpeg$",
+            "count": 4,
+            "keyword": {
+                "count": "4",
+                "description": "絵茶の産物ネタバレあるやつ",
+                "num": int,
+                "post_category": "SPOILER",
+                "post_id": "6411749",
+                "user_id": "2166245",
+                "user_name": "wadahito",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        PoipikuExtractor.__init__(self, match)
+        self.user_id, self.post_id = match.groups()
+
+    def posts(self):
+        """Return the root-relative path of the single matched post"""
+        return ("/{}/{}.html".format(self.user_id, self.post_id),)