gallery-dl/gallery_dl/extractor/hotleak.py

# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hotleak.vip/"""

from .common import Extractor, Message
from .. import text, exception
import binascii

BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"


class HotleakExtractor(Extractor):
    """Base class for hotleak extractors"""
    category = "hotleak"
    directory_fmt = ("{category}", "{creator}",)
    filename_fmt = "{creator}_{id}.{extension}"
    archive_fmt = "{type}_{creator}_{id}"
    root = "https://hotleak.vip"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # all requests made through this session send the site root
        # as their Referer header
        self.session.headers["Referer"] = self.root

    def items(self):
        """Emit a Directory and a Url message for each post of posts()"""
        for entry in self.posts():
            yield Message.Directory, entry
            yield Message.Url, entry["url"], entry

    def posts(self):
        """Provide the posts to process; the base class has none"""
        return ()

    def _pagination(self, url, params):
        """Yield one link per listing entry, walking through all pages

        'params' is a query string; its "page" value (1 when absent)
        selects the first page requested. Each yielded value is the text
        extracted between '<a href="' and '"' of an entry.
        """
        query = text.parse_query(params)
        query["page"] = text.parse_int(query.get("page"), 1)

        while True:
            html = self.request(url, params=query).text
            if "</article>" not in html:
                # a page without any entry ends the listing
                break

            for article in text.extract_iter(
                    html, '<article class="movie-item', '</article>'):
                yield text.extr(article, '<a href="', '"')

            query["page"] += 1


def decode_video_url(url):
    """Return the plain video URL hidden in an obfuscated 'url' string

    The payload is whatever remains after dropping the first and the
    last 16 characters; it is stored back to front and base64-encoded.
    """
    payload = url[16:-16]
    return binascii.a2b_base64(payload[::-1]).decode()


class HotleakPostExtractor(HotleakExtractor):
    """Extractor for individual posts on hotleak"""
    subcategory = "post"
    pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))"
               r"([^/]+)/(photo|video)/(\d+)")
    test = (
        ("https://hotleak.vip/kaiyakawaii/photo/1617145", {
            "pattern": r"https://hotleak\.vip/storage/images/3625"
                       r"/1617145/fefdd5988dfcf6b98cc9e11616018868\.jpg",
            "keyword": {
                "id": 1617145,
                "creator": "kaiyakawaii",
                "type": "photo",
                "filename": "fefdd5988dfcf6b98cc9e11616018868",
                "extension": "jpg",
            },
        }),
        ("https://hotleak.vip/lilmochidoll/video/1625538", {
            "pattern": r"ytdl:https://cdn8-leak\.camhdxx\.com"
                       r"/1661/1625538/index\.m3u8",
            "keyword": {
                "id": 1625538,
                "creator": "lilmochidoll",
                "type": "video",
                "filename": "index",
                "extension": "mp4",
            },
        }),
    )

    def __init__(self, match):
        HotleakExtractor.__init__(self, match)
        # URL pattern groups: creator name, "photo" or "video", numeric ID
        self.creator, self.type, self.id = match.groups()

    def posts(self):
        """Fetch the post page and return a 1-tuple with its metadata"""
        post_url = "{}/{}/{}/{}".format(
            self.root, self.creator, self.type, self.id)
        html = self.request(post_url).text
        # only the part of the page holding the media element is inspected
        section = text.extr(
            html, '<div class="movie-image thumb">', '</article>')

        post = {
            "id"     : text.parse_int(self.id),
            "creator": self.creator,
            "type"   : self.type,
        }

        if self.type == "photo":
            post["url"] = text.extr(section, 'data-src="', '"')
            text.nameext_from_url(post["url"], post)

        elif self.type == "video":
            # the stream address is embedded in obfuscated form
            encoded = text.extr(text.unescape(section), '"src":"', '"')
            post["url"] = "ytdl:" + decode_video_url(encoded)
            text.nameext_from_url(post["url"], post)
            # override whatever extension the stream URL produced
            post["extension"] = "mp4"

        return (post,)


class HotleakCreatorExtractor(HotleakExtractor):
    """Extractor for all posts from a hotleak creator"""
    subcategory = "creator"
    pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))"
               r"([^/?#]+)/?$")
    test = (
        ("https://hotleak.vip/kaiyakawaii", {
            "range": "1-200",
            "count": 200,
        }),
        ("https://hotleak.vip/stellaviolet", {
            "count": "> 600"
        }),
        ("https://hotleak.vip/doesnotexist", {
            "exception": exception.NotFoundError,
        }),
    )

    def __init__(self, match):
        HotleakExtractor.__init__(self, match)
        self.creator = match.group(1)

    def posts(self):
        """Return an iterable over all posts of the creator"""
        url = "{}/{}".format(self.root, self.creator)
        return self._pagination(url)

    def _pagination(self, url):
        """Yield one metadata dict per photo/video post, page by page

        Pages are requested as JSON (XMLHttpRequest header) starting at
        page 1 until an empty result is returned. Every yielded dict is
        a separate object, so consumers may keep or modify it without
        affecting later posts. Posts whose "type" is neither 0 (photo)
        nor 1 (video) are skipped, since no URL can be built for them.
        """
        headers = {"X-Requested-With": "XMLHttpRequest"}
        params = {"page": 1}

        while True:
            try:
                response = self.request(
                    url, headers=headers, params=params, notfound="creator")
            except exception.HttpError as exc:
                failed = exc.response
                # guard in case the error carries no response object;
                # 'is not None' instead of truthiness, because a response
                # object may evaluate as false for error status codes
                if failed is not None and failed.status_code == 429:
                    # rate limited: wait until the time announced by
                    # the server, then retry the same page
                    self.wait(
                        until=failed.headers.get("X-RateLimit-Reset"))
                    continue
                raise

            posts = response.json()
            if not posts:
                return

            for post in posts:
                # fresh dict for every post instead of one shared,
                # repeatedly mutated object
                data = {
                    "creator": self.creator,
                    "id"     : text.parse_int(post["id"]),
                }

                if post["type"] == 0:
                    data["type"] = "photo"
                    data["url"] = self.root + "/storage/" + post["image"]
                    text.nameext_from_url(data["url"], data)

                elif post["type"] == 1:
                    data["type"] = "video"
                    data["url"] = "ytdl:" + decode_video_url(
                        post["stream_url_play"])
                    text.nameext_from_url(data["url"], data)
                    data["extension"] = "mp4"

                else:
                    # unknown post type: nothing downloadable to report
                    continue

                yield data
            params["page"] += 1


class HotleakCategoryExtractor(HotleakExtractor):
    """Extractor for hotleak categories"""
    subcategory = "category"
    pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
    test = (
        ("https://hotleak.vip/photos", {
            "pattern": HotleakPostExtractor.pattern,
            "range": "1-50",
            "count": 50,
        }),
        ("https://hotleak.vip/videos"),
        ("https://hotleak.vip/creators", {
            "pattern": HotleakCreatorExtractor.pattern,
            "range": "1-50",
            "count": 50,
        }),
        ("https://hotleak.vip/hot"),
    )

    def __init__(self, match):
        HotleakExtractor.__init__(self, match)
        # URL pattern groups: category name and optional query string
        self._category, self.params = match.groups()

    def items(self):
        """Queue every link of the category listing for further extraction

        Links from "hot" and "creators" are handed to the creator
        extractor, links from "videos" and "photos" to the post extractor.
        """
        if self._category in ("hot", "creators"):
            target = HotleakCreatorExtractor
        elif self._category in ("videos", "photos"):
            target = HotleakPostExtractor
        data = {"_extractor": target}

        url = "{}/{}".format(self.root, self._category)
        for link in self._pagination(url, self.params):
            yield Message.Queue, link, data


class HotleakSearchExtractor(HotleakExtractor):
    """Extractor for hotleak search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
    test = (
        ("https://hotleak.vip/search?search=gallery-dl", {
            "count": 0,
        }),
        ("https://hotleak.vip/search?search=hannah", {
            "count": "> 30",
        }),
    )

    def __init__(self, match):
        HotleakExtractor.__init__(self, match)
        # query string of the search URL
        self.params = match.group(1)

    def items(self):
        """Queue each search result link for the creator extractor"""
        search_url = self.root + "/search"
        data = {"_extractor": HotleakCreatorExtractor}
        for link in self._pagination(search_url, self.params):
            yield Message.Queue, link, data
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`# -- coding: utf-8 --`

			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://hotleak.vip/"""`

			`from .common import Extractor, Message`
			`from .. import text, exception`
[hotleak] optimize decoding video URLs - use binascii module - combine slice and reverse step 2023-01-28 15:27:08 +01:00			`import binascii`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00
			`BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"`


			`class HotleakExtractor(Extractor):`
			`"""Base class for hotleak extractors"""`
			`category = "hotleak"`
			`directory_fmt = ("{category}", "{creator}",)`
			`filename_fmt = "{creator}_{id}.{extension}"`
			`archive_fmt = "{type}_{creator}_{id}"`
			`root = "https://hotleak.vip"`

			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.session.headers["Referer"] = self.root`

			`def items(self):`
			`for post in self.posts():`
			`yield Message.Directory, post`
			`yield Message.Url, post["url"], post`

			`def posts(self):`
			`"""Return an iterable containing relevant posts"""`
			`return ()`

			`def _pagination(self, url, params):`
			`params = text.parse_query(params)`
			`params["page"] = text.parse_int(params.get("page"), 1)`

			`while True:`
			`page = self.request(url, params=params).text`
			`if "</article>" not in page:`
			`return`

			`for item in text.extract_iter(`
			`page, '<article class="movie-item', '</article>'):`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`yield text.extr(item, '<a href="', '"')`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00
			`params["page"] += 1`


[hotleak] optimize decoding video URLs - use binascii module - combine slice and reverse step 2023-01-28 15:27:08 +01:00			`def decode_video_url(url):`
			`# cut first and last 16 characters, reverse, base64 decode`
			`return binascii.a2b_base64(url[-17:15:-1]).decode()`
Make decode_video_url static (used in both post and creator extractor). 2023-01-28 15:36:49 +01:00

[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`class HotleakPostExtractor(HotleakExtractor):`
			`"""Extractor for individual posts on hotleak"""`
			`subcategory = "post"`
[hotleak] Fix downloading of creators whose name starts with a category name E.g. `hot4lexi` would start downloading the `hot` section by mistake This happened because the regex had a negative lookahead for the category names, but didn't ensure that they where followed by either end-of-string or a slash. 2023-04-03 15:30:27 +02:00			`pattern = (BASE_PATTERN + r"/(?!(?:hot\|creators\|videos\|photos)(?:$\|/))"`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`r"([^/]+)/(photo\|video)/(\d+)")`
			`test = (`
			`("https://hotleak.vip/kaiyakawaii/photo/1617145", {`
			`"pattern": r"https://hotleak\.vip/storage/images/3625"`
			`r"/1617145/fefdd5988dfcf6b98cc9e11616018868\.jpg",`
			`"keyword": {`
			`"id": 1617145,`
			`"creator": "kaiyakawaii",`
			`"type": "photo",`
			`"filename": "fefdd5988dfcf6b98cc9e11616018868",`
			`"extension": "jpg",`
			`},`
			`}),`
			`("https://hotleak.vip/lilmochidoll/video/1625538", {`
			`"pattern": r"ytdl:https://cdn8-leak\.camhdxx\.com"`
			`r"/1661/1625538/index\.m3u8",`
			`"keyword": {`
			`"id": 1625538,`
			`"creator": "lilmochidoll",`
			`"type": "video",`
			`"filename": "index",`
			`"extension": "mp4",`
			`},`
			`}),`
			`)`

			`def __init__(self, match):`
			`HotleakExtractor.__init__(self, match)`
			`self.creator, self.type, self.id = match.groups()`

			`def posts(self):`
			`url = "{}/{}/{}/{}".format(`
			`self.root, self.creator, self.type, self.id)`
			`page = self.request(url).text`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`page = text.extr(`
			`page, '<div class="movie-image thumb">', '</article>')`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`data = {`
			`"id" : text.parse_int(self.id),`
			`"creator": self.creator,`
			`"type" : self.type,`
			`}`

			`if self.type == "photo":`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`data["url"] = text.extr(page, 'data-src="', '"')`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`text.nameext_from_url(data["url"], data)`

			`elif self.type == "video":`
Make decode_video_url static (used in both post and creator extractor). 2023-01-28 15:36:49 +01:00			`data["url"] = "ytdl:" + decode_video_url(text.extr(`
Reverse engineered obfuscated JS function and reimplemented in python. 2023-01-27 22:30:06 +01:00			`text.unescape(page), '"src":"', '"'))`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`text.nameext_from_url(data["url"], data)`
			`data["extension"] = "mp4"`

			`return (data,)`


			`class HotleakCreatorExtractor(HotleakExtractor):`
			`"""Extractor for all posts from a hotleak creator"""`
			`subcategory = "creator"`
Fix line length 2023-04-03 15:38:42 +02:00			`pattern = (BASE_PATTERN + r"/(?!(?:hot\|creators\|videos\|photos)(?:$\|/))"`
Fix indentation 2023-04-03 15:44:14 +02:00			`r"([^/?#]+)/?$")`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`test = (`
			`("https://hotleak.vip/kaiyakawaii", {`
			`"range": "1-200",`
			`"count": 200,`
			`}),`
			`("https://hotleak.vip/stellaviolet", {`
			`"count": "> 600"`
			`}),`
			`("https://hotleak.vip/doesnotexist", {`
			`"exception": exception.NotFoundError,`
			`}),`
			`)`

			`def __init__(self, match):`
			`HotleakExtractor.__init__(self, match)`
			`self.creator = match.group(1)`

			`def posts(self):`
			`url = "{}/{}".format(self.root, self.creator)`
			`return self._pagination(url)`

			`def _pagination(self, url):`
			`headers = {"X-Requested-With": "XMLHttpRequest"}`
			`params = {"page": 1}`

			`while True:`
			`try:`
			`response = self.request(`
			`url, headers=headers, params=params, notfound="creator")`
			`except exception.HttpError as exc:`
			`if exc.response.status_code == 429:`
			`self.wait(`
			`until=exc.response.headers.get("X-RateLimit-Reset"))`
			`continue`
[hotleak] fix UnboundLocalError (#3288, #3293) 2022-11-23 22:21:59 +01:00			`raise`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00
			`posts = response.json()`
			`if not posts:`
			`return`

			`data = {"creator": self.creator}`
			`for post in posts:`
			`data["id"] = text.parse_int(post["id"])`

			`if post["type"] == 0:`
			`data["type"] = "photo"`
			`data["url"] = self.root + "/storage/" + post["image"]`
			`text.nameext_from_url(data["url"], data)`

			`elif post["type"] == 1:`
			`data["type"] = "video"`
Make decode_video_url static (used in both post and creator extractor). 2023-01-28 15:36:49 +01:00			`data["url"] = "ytdl:" + decode_video_url(`
Tidy up code. 2023-01-27 22:52:47 +01:00			`post["stream_url_play"])`
[hotleak] add hotleak extractor (#2909) (#2890) 2022-09-18 13:37:16 +02:00			`text.nameext_from_url(data["url"], data)`
			`data["extension"] = "mp4"`

			`yield data`
			`params["page"] += 1`


			`class HotleakCategoryExtractor(HotleakExtractor):`
			`"""Extractor for hotleak categories"""`
			`subcategory = "category"`
			`pattern = BASE_PATTERN + r"/(hot\|creators\|videos\|photos)(?:/?\?([^#]+))?"`
			`test = (`
			`("https://hotleak.vip/photos", {`
			`"pattern": HotleakPostExtractor.pattern,`
			`"range": "1-50",`
			`"count": 50,`
			`}),`
			`("https://hotleak.vip/videos"),`
			`("https://hotleak.vip/creators", {`
			`"pattern": HotleakCreatorExtractor.pattern,`
			`"range": "1-50",`
			`"count": 50,`
			`}),`
			`("https://hotleak.vip/hot"),`
			`)`

			`def __init__(self, match):`
			`HotleakExtractor.__init__(self, match)`
			`self._category, self.params = match.groups()`

			`def items(self):`
			`url = "{}/{}".format(self.root, self._category)`

			`if self._category in ("hot", "creators"):`
			`data = {"_extractor": HotleakCreatorExtractor}`
			`elif self._category in ("videos", "photos"):`
			`data = {"_extractor": HotleakPostExtractor}`

			`for item in self._pagination(url, self.params):`
			`yield Message.Queue, item, data`


			`class HotleakSearchExtractor(HotleakExtractor):`
			`"""Extractor for hotleak search results"""`
			`subcategory = "search"`
			`pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"`
			`test = (`
			`("https://hotleak.vip/search?search=gallery-dl", {`
			`"count": 0,`
			`}),`
			`("https://hotleak.vip/search?search=hannah", {`
			`"count": "> 30",`
			`}),`
			`)`

			`def __init__(self, match):`
			`HotleakExtractor.__init__(self, match)`
			`self.params = match.group(1)`

			`def items(self):`
			`data = {"_extractor": HotleakCreatorExtractor}`
			`for creator in self._pagination(self.root + "/search", self.params):`
			`yield Message.Queue, creator, data`