gallery-dl/gallery_dl/extractor/paheal.py

# -*- coding: utf-8 -*-

# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://rule34.paheal.net/"""

from .common import Extractor, Message
from .. import text


class PahealExtractor(Extractor):
    """Base class for paheal extractors

    Subclasses provide the posts to process through get_posts() and may
    add shared metadata through get_metadata(); items() combines both
    into Directory and Url messages.
    """
    basecategory = "shimmie2"
    category = "paheal"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    archive_fmt = "{id}"
    root = "https://rule34.paheal.net"

    def items(self):
        """Yield a Directory and a Url message for every post"""
        # presumably marks the site's terms-and-conditions prompt
        # as already accepted -- confirm against the live site
        self.cookies.set(
            "ui-tnc-agreed", "true", domain="rule34.paheal.net")
        data = self.get_metadata()

        for post in self.get_posts():
            url = post["file_url"]
            # numeric fields are extracted as strings
            for key in ("id", "width", "height"):
                post[key] = text.parse_int(post[key])
            post["tags"] = text.unquote(post["tags"])
            post.update(data)
            yield Message.Directory, post
            yield Message.Url, url, post

    def get_metadata(self):
        """Return general metadata"""
        return {}

    def get_posts(self):
        """Return an iterable containing data of all relevant posts"""
        # nothing to process by default; returning an empty tuple
        # (instead of an implicit None) keeps items() iterable-safe
        return ()

    def _extract_post(self, post_id):
        """Request the view page of 'post_id' and return its metadata

        The returned dict holds 'id', 'tags', 'md5', 'file_url',
        'uploader', 'date', 'source', 'size', 'width', 'height',
        'duration', 'filename', and 'extension'.
        """
        url = "{}/post/view/{}".format(self.root, post_id)
        # 'extr' consumes the page sequentially, so the order of the
        # calls below has to follow the order of the markers in the page
        extr = text.extract_from(self.request(url).text)

        post = {
            "id"      : post_id,
            # HTML-unescape tags the same way
            # PahealTagExtractor._extract_data() does, so both
            # extraction paths report them in the same form
            "tags"    : text.unescape(extr(": ", "<")),
            "md5"     : extr("/_thumbs/", "/"),
            # fall back to the <source> element when there is
            # no 'main_image' element
            "file_url": (extr("id='main_image' src='", "'") or
                         extr("<source src='", "'")),
            "uploader": text.unquote(extr(
                "class='username' href='/user/", "'")),
            "date"    : text.parse_datetime(
                extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
            "source"  : text.unescape(text.extr(
                extr(">Source Link<", "</td>"), "href='", "'")),
        }

        # the 'Info' cell is split into "<dimensions> // <size> // <ext>"
        dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
        # the last character of 'size' is dropped before parsing
        post["size"] = text.parse_bytes(size[:-1])
        post["width"], _, height = dimensions.partition("x")
        # 'height' may be followed by ", <duration>"; the last character
        # of the duration (presumably a unit suffix) is dropped
        post["height"], _, duration = height.partition(", ")
        post["duration"] = text.parse_float(duration[:-1])
        post["filename"] = "{} - {}".format(post_id, post["tags"])
        post["extension"] = ext

        return post


class PahealTagExtractor(PahealExtractor):
    """Extractor for images from rule34.paheal.net by search-tags"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
               r"/post/list/([^/?#]+)")
    example = "https://rule34.paheal.net/post/list/TAG/1"
    per_page = 70

    def __init__(self, match):
        PahealExtractor.__init__(self, match)
        # percent-decoded search tags, as reported in 'search_tags'
        self.tags = text.unquote(match.group(1))

    def _init(self):
        """Select the per-post extraction method"""
        # with the 'metadata' option enabled, every post's own view page
        # is requested instead of parsing the listing's thumbnail markup
        if self.config("metadata"):
            self._extract_data = self._extract_data_ex

    def get_metadata(self):
        """Return the search tags as general metadata"""
        return {"search_tags": self.tags}

    def get_posts(self):
        """Yield post data from all listing pages of the search tags

        Pages are requested starting at 1 until a page no longer
        contains a ">Next<" link.
        """
        from urllib.parse import quote

        # 'self.tags' has been percent-decoded in __init__(); encode it
        # again so that characters like '#' or '?' stay part of the
        # path instead of starting a fragment or query string
        # ('/' is left untouched by quote()'s default 'safe' value)
        tags = quote(self.tags)

        pnum = 1
        while True:
            url = "{}/post/list/{}/{}".format(self.root, tags, pnum)
            page = self.request(url).text

            # only consider thumbnails after the image list marker
            pos = page.find("id='image-list'")
            for post in text.extract_iter(
                    page, "<img id='thumb_", "Only</a>", pos):
                yield self._extract_data(post)

            if ">Next<" not in page:
                return
            pnum += 1

    @staticmethod
    def _extract_data(post):
        """Return post metadata parsed from a listing thumbnail snippet

        'post' is the markup following "<img id='thumb_"; it starts
        with the post ID, and its 'title' attribute consists of three
        lines: tags, "<dimensions> // <size> // <ext>", and date.
        """
        pid , pos = text.extract(post, "", "'")
        data, pos = text.extract(post, "title='", "'", pos)
        md5 , pos = text.extract(post, "/_thumbs/", "/", pos)
        url , pos = text.extract(post, "<a href='", "'", pos)

        tags, data, date = data.split("\n")
        dimensions, size, ext = data.split(" // ")
        tags = text.unescape(tags)
        width, _, height = dimensions.partition("x")
        # 'height' may be followed by ", <duration>"
        height, _, duration = height.partition(", ")

        return {
            "id": pid, "md5": md5, "file_url": url,
            "width": width, "height": height,
            # the last character of 'duration' and 'size'
            # (presumably a unit suffix) is dropped before parsing
            "duration": text.parse_float(duration[:-1]),
            "tags": tags,
            "size": text.parse_bytes(size[:-1]),
            "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
            "filename" : "{} - {}".format(pid, tags),
            "extension": ext,
        }

    def _extract_data_ex(self, post):
        """Return post metadata taken from the post's own view page"""
        # the snippet starts with the post ID, terminated by a quote
        pid = post[:post.index("'")]
        return self._extract_post(pid)


class PahealPostExtractor(PahealExtractor):
    """Extractor for one individual post from rule34.paheal.net"""
    subcategory = "post"
    pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)"
               r"\.paheal\.net/post/view/(\d+)")
    example = "https://rule34.paheal.net/post/view/12345"

    def __init__(self, match):
        super().__init__(match)
        # post ID as captured by the first group of 'pattern'
        self.post_id = match.group(1)

    def get_posts(self):
        """Return a one-element tuple with the data of the given post"""
        post = self._extract_post(self.post_id)
        return (post,)
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`# -- coding: utf-8 --`

[shimmie2] add generic extractors for Shimmie2 sites (#3734) add support for - loudbooru.com (#3734) - booru.cavemanon.xyz (#3734) - giantessbooru.com (#943) - tentaclerape.net 2023-04-26 18:01:07 +02:00			`# Copyright 2018-2023 Mike Fährmann`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

merge SharedConfigMixin functionality into Extractor 2020-11-17 00:34:07 +01:00			`"""Extractors for https://rule34.paheal.net/"""`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
merge SharedConfigMixin functionality into Extractor 2020-11-17 00:34:07 +01:00			`from .common import Extractor, Message`
fix util.parse_bytes invocations (should be text.parse_bytes) 2018-05-10 22:07:55 +02:00			`from .. import text`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00

merge SharedConfigMixin functionality into Extractor 2020-11-17 00:34:07 +01:00			`class PahealExtractor(Extractor):`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`"""Base class for paheal extractors"""`
[shimmie2] add generic extractors for Shimmie2 sites (#3734) add support for - loudbooru.com (#3734) - booru.cavemanon.xyz (#3734) - giantessbooru.com (#943) - tentaclerape.net 2023-04-26 18:01:07 +02:00			`basecategory = "shimmie2"`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`category = "paheal"`
			`filename_fmt = "{category}_{id}_{md5}.{extension}"`
set 'archive_fmt' values These are going to be used to create an unique id for each image. 2018-01-30 22:49:16 +01:00			`archive_fmt = "{id}"`
[paheal] use HTTPS 2018-07-17 21:23:50 +02:00			`root = "https://rule34.paheal.net"`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`def items(self):`
consistent cookie-related names - rename every cookie variable or method to 'cookies_*' - simplify '.session.cookies' to just '.cookies' - more consistent 'login()' structure 2023-07-21 22:38:39 +02:00			`self.cookies.set(`
[paheal] fix extraction (fixes #1088) 2020-10-28 21:51:31 +01:00			`"ui-tnc-agreed", "true", domain="rule34.paheal.net")`
[paheal] create directory for each post (closes #1147) 2020-12-01 12:14:55 +01:00			`data = self.get_metadata()`
[paheal] fix extraction (fixes #1088) 2020-10-28 21:51:31 +01:00
[paheal] create directory for each post (closes #1147) 2020-12-01 12:14:55 +01:00			`for post in self.get_posts():`
			`url = post["file_url"]`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`for key in ("id", "width", "height"):`
[paheal] create directory for each post (closes #1147) 2020-12-01 12:14:55 +01:00			`post[key] = text.parse_int(post[key])`
			`post["tags"] = text.unquote(post["tags"])`
			`post.update(data)`
			`yield Message.Directory, post`
[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`yield Message.Url, url, post`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`def get_metadata(self):`
			`"""Return general metadata"""`
			`return {}`

			`def get_posts(self):`
			`"""Return an iterable containing data of all relevant posts"""`

[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00			`def _extract_post(self, post_id):`
			`url = "{}/post/view/{}".format(self.root, post_id)`
			`extr = text.extract_from(self.request(url).text)`

			`post = {`
			`"id" : post_id,`
			`"tags" : extr(": ", "<"),`
			`"md5" : extr("/_thumbs/", "/"),`
[paheal] add proper support for videos (#2892) 2022-09-04 13:20:30 +02:00			`"file_url": (extr("id='main_image' src='", "'") or`
			`extr("<source src='", "'")),`
[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00			`"uploader": text.unquote(extr(`
			`"class='username' href='/user/", "'")),`
			`"date" : text.parse_datetime(`
			`extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),`
[paheal] unescape 'source' 2023-07-07 20:03:00 +02:00			`"source" : text.unescape(text.extr(`
[paheal] fix 'source' metadata 2024-01-19 22:24:39 +01:00			`extr(">Source Link<", "</td>"), "href='", "'")),`
[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00			`}`

[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")`
[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00			`post["size"] = text.parse_bytes(size[:-1])`
[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`post["width"], _, height = dimensions.partition("x")`
[paheal] add proper support for videos (#2892) 2022-09-04 13:20:30 +02:00			`post["height"], _, duration = height.partition(", ")`
			`post["duration"] = text.parse_float(duration[:-1])`
[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`post["filename"] = "{} - {}".format(post_id, post["tags"])`
			`post["extension"] = ext`
[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00
[paheal] add 'metadata' option (#2641) 2022-06-04 16:05:49 +02:00			`return post`
[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`class PahealTagExtractor(PahealExtractor):`
			`"""Extractor for images from rule34.paheal.net by search-tags"""`
			`subcategory = "tag"`
[paheal] rename "tags" to "search_tags" to better match field names of other booru extractors 2019-02-15 16:40:15 +01:00			`directory_fmt = ("{category}", "{search_tags}")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?(?:rule34\|rule63\|cosplay)\.paheal\.net"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`r"/post/list/([^/?#]+)")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://rule34.paheal.net/post/list/TAG/1"`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`per_page = 70`

			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`PahealExtractor.__init__(self, match)`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`self.tags = text.unquote(match.group(1))`

decouple extractor initialization Introduce an 'initialize()' function that does the actual init (session, cookies, config options) and can be called separately from the constructor __init__(). This allows, for example, to adjust config access inside a Job before most of it already happened when calling 'extractor.find()'. 2023-07-25 20:09:44 +02:00			`def _init(self):`
[paheal] add 'metadata' option (#2641) 2022-06-04 16:05:49 +02:00			`if self.config("metadata"):`
			`self._extract_data = self._extract_data_ex`

[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`def get_metadata(self):`
[paheal] rename "tags" to "search_tags" to better match field names of other booru extractors 2019-02-15 16:40:15 +01:00			`return {"search_tags": self.tags}`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`def get_posts(self):`
			`pnum = 1`
			`while True:`
			`url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)`
			`page = self.request(url).text`

[paheal] fix extraction (#4262) swap ' and " 2023-07-04 17:36:41 +02:00			`pos = page.find("id='image-list'")`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`for post in text.extract_iter(`
[paheal] fix extraction (#4262) swap ' and " 2023-07-04 17:36:41 +02:00			`page, "<img id='thumb_", "Only</a>", pos):`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`yield self._extract_data(post)`

			`if ">Next<" not in page:`
			`return`
			`pnum += 1`

			`@staticmethod`
			`def _extract_data(post):`
[paheal] fix extraction (#4262) swap ' and " 2023-07-04 17:36:41 +02:00			`pid , pos = text.extract(post, "", "'")`
			`data, pos = text.extract(post, "title='", "'", pos)`
			`md5 , pos = text.extract(post, "/_thumbs/", "/", pos)`
			`url , pos = text.extract(post, "<a href='", "'", pos)`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
[paheal] fix extraction (fixes #1088) 2020-10-28 21:51:31 +01:00			`tags, data, date = data.split("\n")`
			`dimensions, size, ext = data.split(" // ")`
[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`tags = text.unescape(tags)`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`width, _, height = dimensions.partition("x")`
[paheal] add proper support for videos (#2892) 2022-09-04 13:20:30 +02:00			`height, _, duration = height.partition(", ")`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`return {`
[paheal] improve metadata extraction (#2641) - unescape 'tags' - add 'date', 'source', and 'uploader' for single posts 2022-05-30 17:23:08 +02:00			`"id": pid, "md5": md5, "file_url": url,`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`"width": width, "height": height,`
[paheal] add proper support for videos (#2892) 2022-09-04 13:20:30 +02:00			`"duration": text.parse_float(duration[:-1]),`
[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`"tags": tags,`
fix util.parse_bytes invocations (should be text.parse_bytes) 2018-05-10 22:07:55 +02:00			`"size": text.parse_bytes(size[:-1]),`
[paheal] add 'metadata' option (#2641) 2022-06-04 16:05:49 +02:00			`"date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),`
[paheal] restore 'extension' metadata (#4976) 2023-12-26 16:09:26 +01:00			`"filename" : "{} - {}".format(pid, tags),`
			`"extension": ext,`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`}`

[paheal] add 'metadata' option (#2641) 2022-06-04 16:05:49 +02:00			`def _extract_data_ex(self, post):`
[paheal] fix a78f8ce5 for enabled 'metadata' (#4262) 2023-07-07 20:00:49 +02:00			`pid = post[:post.index("'")]`
[paheal] add 'metadata' option (#2641) 2022-06-04 16:05:49 +02:00			`return self._extract_post(pid)`

[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`class PahealPostExtractor(PahealExtractor):`
			`"""Extractor for single images from rule34.paheal.net"""`
			`subcategory = "post"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`pattern = (r"(?:https?://)?(?:rule34\|rule63\|cosplay)\.paheal\.net"`
			`r"/post/view/(\d+)")`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://rule34.paheal.net/post/view/12345"`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`PahealExtractor.__init__(self, match)`
[paheal] add tag- and post-extractors (closes #69) 2018-01-15 16:39:05 +01:00			`self.post_id = match.group(1)`

			`def get_posts(self):`
[paheal] add 'metadata' option (#2641) 2022-06-04 16:05:49 +02:00			`return (self._extract_post(self.post_id),)`