gallery-dl/gallery_dl/extractor/xhamster.py

# -*- coding: utf-8 -*-

# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://xhamster.com/"""

from .common import Extractor, Message
from .. import text
import json


# Shared URL prefix for all xhamster extractors.
# Group 1 captures the complete host name the URL was given with.
BASE_PATTERN = (
    r"(?:https?://)?"             # optional scheme
    r"("                          # group 1: host name
    r"(?:[\w-]+\.)?xhamster"      # optional subdomain + site name
    r"(?:\d?\.(?:com|one|desi)"   # optional digit + one of the known TLDs
    r"|\.porncache\.net)"         # ... or the porncache.net domain
    r")"
)


class XhamsterExtractor(Extractor):
    """Common base class for all xhamster extractors

    Stores the host name the input URL was matched with in 'self.root',
    which subclasses use as the base for the URLs they request.
    """
    category = "xhamster"

    def __init__(self, match):
        super().__init__(match)
        # first match group: host name (optional subdomain, site name, TLD)
        host = match.group(1)
        self.root = "https://{}".format(host)


class XhamsterGalleryExtractor(XhamsterExtractor):
    """Extractor for image galleries on xhamster.com"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user[name]}",
                     "{gallery[id]} {gallery[title]}")
    filename_fmt = "{num:>03}_{id}.{extension}"
    archive_fmt = "{id}"
    # match group 2: path of the gallery page ("/photos/gallery/...")
    pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)"
    test = (
        ("https://xhamster.com/photos/gallery/11748968", {
            "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
            "count": ">= 144",
            "keyword": {
                "comments": int,
                "count": int,
                "favorite": bool,
                "id": int,
                "num": int,
                "height": int,
                "width": int,
                "imageURL": str,
                "pageURL": str,
                "thumbURL": str,
                "gallery": {
                    "date": "dt:2019-04-16 00:07:31",
                    "description": "",
                    "dislikes": int,
                    "id": 11748968,
                    "likes": int,
                    "tags": ["NON-Porn"],
                    "thumbnail": str,
                    "title": "Make the world better.",
                    "views": int,
                },
                "user": {
                    "id": 16874672,
                    "name": "Anonymousrants",
                    "retired": bool,
                    "subscribers": int,
                    "url": "https://xhamster.com/users/anonymousrants",
                    "verified": bool,
                },
            },
        }),
        ("https://jp.xhamster2.com/photos/gallery/11748968", {
            "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
            "count": ">= 144",
        }),
        ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),
        ("https://xhamster.com/photos/gallery/11748968"),
        ("https://xhamster.one/photos/gallery/11748968"),
        ("https://xhamster.desi/photos/gallery/11748968"),
        ("https://xhamster2.com/photos/gallery/11748968"),
        ("https://en.xhamster.com/photos/gallery/11748968"),
        ("https://xhamster.porncache.net/photos/gallery/11748968"),
    )

    def __init__(self, match):
        XhamsterExtractor.__init__(self, match)
        self.path = match.group(2)
        # data of the first gallery page; set by metadata() and consumed
        # by images(), so that page is only requested once
        self.data = None

    def items(self):
        """Yield a Directory message followed by one Url message per image"""
        data = self.metadata()
        yield Message.Directory, data
        for num, image in enumerate(self.images(), 1):
            url = image["imageURL"]
            # add the gallery-wide metadata to each image's own fields
            image.update(data)
            image["num"] = num
            yield Message.Url, url, text.nameext_from_url(url, image)

    def metadata(self):
        """Request the gallery page and return its user/gallery metadata

        The parsed page data is kept in 'self.data' for images().
        """
        self.data = self._data(self.root + self.path)
        user = self.data["authorModel"]
        imgs = self.data["photosGalleryModel"]

        return {
            "user":
            {
                "id"         : text.parse_int(user["id"]),
                "url"        : user["pageURL"],
                "name"       : user["name"],
                "retired"    : user["retired"],
                "verified"   : user["verified"],
                "subscribers": user["subscribers"],
            },
            "gallery":
            {
                "id"         : text.parse_int(imgs["id"]),
                "tags"       : [c["name"] for c in imgs["categories"]],
                "date"       : text.parse_timestamp(imgs["created"]),
                "views"      : text.parse_int(imgs["views"]),
                "likes"      : text.parse_int(imgs["rating"]["likes"]),
                "dislikes"   : text.parse_int(imgs["rating"]["dislikes"]),
                "title"      : text.unescape(imgs["title"]),
                "description": text.unescape(imgs["description"]),
                "thumbnail"  : imgs["thumbURL"],
            },
            "count": text.parse_int(imgs["quantity"]),
        }

    def images(self):
        """Yield the photo objects from all pages of the gallery

        Starts with the page data stored by metadata(); if there is none,
        the first gallery page is requested here instead.
        """
        data = self.data
        # release the reference; only one page is needed at a time
        self.data = None
        if data is None:
            # images() was called without a preceding metadata() call
            data = self._data(self.root + self.path)

        while True:
            for image in data["photosGalleryModel"]["photos"]:
                # remove 'modelName' from the image metadata;
                # a missing key must not abort the whole gallery
                image.pop("modelName", None)
                yield image

            pgntn = data["pagination"]
            if pgntn["active"] == pgntn["maxPage"]:
                return
            # swap the last 3 characters of the link template (presumably
            # a page number placeholder) for the next page number
            url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"])
            data = self._data(url)

    def _data(self, url):
        """Request 'url' and return its parsed 'window.initials' object"""
        page = self.request(url).text
        # the JSON object sits between 'window.initials=' and the end of
        # its <script> element; strip the trailing ';' and line breaks
        return json.loads(text.extr(
            page, "window.initials=", "</script>").rstrip("\n\r;"))


class XhamsterUserExtractor(XhamsterExtractor):
    """Extractor for all galleries of an xhamster user"""
    subcategory = "user"
    # match group 2: user name
    pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])"
    test = (
        ("https://xhamster.com/users/goldenpalomino/photos", {
            "pattern": XhamsterGalleryExtractor.pattern,
            "count": 50,
            "range": "1-50",
        }),
        ("https://xhamster.com/users/nickname68"),
    )

    def __init__(self, match):
        super().__init__(match)
        self.user = match.group(2)

    def items(self):
        """Yield a Queue message for every gallery link on the user's pages

        Walks the user's photo listing page by page, following the
        'next' link until a page no longer has one.
        """
        queue_data = {"_extractor": XhamsterGalleryExtractor}
        gallery_marker = 'thumb-image-container role-pop" href="'
        page_url = "{}/users/{}/photos".format(self.root, self.user)

        while page_url:
            extr = text.extract_from(self.request(page_url).text)

            # gallery links are read front to back; an empty result
            # means there are no more of them on this page
            gallery_url = extr(gallery_marker, '"')
            while gallery_url:
                yield Message.Queue, gallery_url, queue_data
                gallery_url = extr(gallery_marker, '"')

            # empty when there is no 'next' link, which ends the outer loop
            page_url = extr('data-page="next" href="', '"')
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`# -*- coding: utf-8 -*-`

add tests for specific datetime values 2020-02-23 16:48:30 +01:00			`# Copyright 2019-2020 Mike Fährmann`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://xhamster.com/"""`

			`from .common import Extractor, Message`
			`from .. import text`
			`import json`


generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 2021-12-29 22:39:29 +01:00			`BASE_PATTERN = (r"(?:https?://)?((?:[\w-]+\.)?xhamster"`
[xhamster] support xhamster.porncache.net domains (closes #700) 2020-04-22 18:31:05 +02:00			`r"(?:\d?\.(?:com|one|desi)|\.porncache\.net))")`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00

			`class XhamsterExtractor(Extractor):`
			`"""Base class for xhamster extractors"""`
			`category = "xhamster"`
[xhamster] use input URL domain Don't rewrite all URLs as 'https://xhamster.com/...' 2019-08-13 23:45:33 +02:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.root = "https://" + match.group(1)`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00

			`class XhamsterGalleryExtractor(XhamsterExtractor):`
			`"""Extractor for image galleries on xhamster.com"""`
			`subcategory = "gallery"`
			`directory_fmt = ("{category}", "{user[name]}",`
			`"{gallery[id]} {gallery[title]}")`
			`filename_fmt = "{num:>03}_{id}.{extension}"`
			`archive_fmt = "{id}"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)"`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`test = (`
			`("https://xhamster.com/photos/gallery/11748968", {`
			`"pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",`
[xhamster] update test results 2019-06-07 16:28:49 +02:00			`"count": ">= 144",`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`"keyword": {`
			`"comments": int,`
[xhamster] update test results 2019-06-07 16:28:49 +02:00			`"count": int,`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`"favorite": bool,`
			`"id": int,`
			`"num": int,`
			`"height": int,`
			`"width": int,`
			`"imageURL": str,`
			`"pageURL": str,`
			`"thumbURL": str,`
			`"gallery": {`
add tests for specific datetime values 2020-02-23 16:48:30 +01:00			`"date": "dt:2019-04-16 00:07:31",`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`"description": "",`
			`"dislikes": int,`
			`"id": 11748968,`
			`"likes": int,`
			`"tags": ["NON-Porn"],`
			`"thumbnail": str,`
			`"title": "Make the world better.",`
			`"views": int,`
			`},`
			`"user": {`
			`"id": 16874672,`
			`"name": "Anonymousrants",`
			`"retired": bool,`
			`"subscribers": int,`
			`"url": "https://xhamster.com/users/anonymousrants",`
			`"verified": bool,`
			`},`
			`},`
			`}),`
[xhamster] use input URL domain Don't rewrite all URLs as 'https://xhamster.com/...' 2019-08-13 23:45:33 +02:00			`("https://jp.xhamster2.com/photos/gallery/11748968", {`
			`"pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",`
			`"count": ">= 144",`
			`}),`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),`
			`("https://xhamster.com/photos/gallery/11748968"),`
			`("https://xhamster.one/photos/gallery/11748968"),`
			`("https://xhamster.desi/photos/gallery/11748968"),`
[xhamster] use input URL domain Don't rewrite all URLs as 'https://xhamster.com/...' 2019-08-13 23:45:33 +02:00			`("https://xhamster2.com/photos/gallery/11748968"),`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`("https://en.xhamster.com/photos/gallery/11748968"),`
[xhamster] support xhamster.porncache.net domains (closes #700) 2020-04-22 18:31:05 +02:00			`("https://xhamster.porncache.net/photos/gallery/11748968"),`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`)`

			`def __init__(self, match):`
			`XhamsterExtractor.__init__(self, match)`
[xhamster] use input URL domain Don't rewrite all URLs as 'https://xhamster.com/...' 2019-08-13 23:45:33 +02:00			`self.path = match.group(2)`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`self.data = None`

			`def items(self):`
			`data = self.metadata()`
			`yield Message.Directory, data`
			`for num, image in enumerate(self.images(), 1):`
			`url = image["imageURL"]`
			`image.update(data)`
			`image["num"] = num`
			`yield Message.Url, url, text.nameext_from_url(url, image)`

			`def metadata(self):`
			`self.data = self._data(self.root + self.path)`
			`user = self.data["authorModel"]`
			`imgs = self.data["photosGalleryModel"]`

			`return {`
			`"user":`
			`{`
			`"id" : text.parse_int(user["id"]),`
			`"url" : user["pageURL"],`
			`"name" : user["name"],`
			`"retired" : user["retired"],`
			`"verified" : user["verified"],`
			`"subscribers": user["subscribers"],`
			`},`
			`"gallery":`
			`{`
			`"id" : text.parse_int(imgs["id"]),`
			`"tags" : [c["name"] for c in imgs["categories"]],`
			`"date" : text.parse_timestamp(imgs["created"]),`
			`"views" : text.parse_int(imgs["views"]),`
			`"likes" : text.parse_int(imgs["rating"]["likes"]),`
			`"dislikes" : text.parse_int(imgs["rating"]["dislikes"]),`
[xhamster] unescape 'title' and 'description' 2019-10-04 14:44:51 +02:00			`"title" : text.unescape(imgs["title"]),`
			`"description": text.unescape(imgs["description"]),`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`"thumbnail" : imgs["thumbURL"],`
			`},`
			`"count": text.parse_int(imgs["quantity"]),`
			`}`

			`def images(self):`
			`data = self.data`
			`self.data = None`

			`while True:`
			`for image in data["photosGalleryModel"]["photos"]:`
			`del image["modelName"]`
			`yield image`

			`pgntn = data["pagination"]`
			`if pgntn["active"] == pgntn["maxPage"]:`
			`return`
			`url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"])`
			`data = self._data(url)`

			`def _data(self, url):`
			`page = self.request(url).text`
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`return json.loads(text.extr(`
			`page, "window.initials=", "</script>").rstrip("\n\r;"))`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00

			`class XhamsterUserExtractor(XhamsterExtractor):`
			`"""Extractor for all galleries of an xhamster user"""`
			`subcategory = "user"`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])"`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`test = (`
[pinterest] improve detection of invalid pin.it links 2020-01-18 21:06:44 +01:00			`("https://xhamster.com/users/goldenpalomino/photos", {`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`"pattern": XhamsterGalleryExtractor.pattern,`
			`"count": 50,`
			`"range": "1-50",`
			`}),`
			`("https://xhamster.com/users/nickname68"),`
			`)`

			`def __init__(self, match):`
			`XhamsterExtractor.__init__(self, match)`
[xhamster] use input URL domain Don't rewrite all URLs as 'https://xhamster.com/...' 2019-08-13 23:45:33 +02:00			`self.user = match.group(2)`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00
			`def items(self):`
			`url = "{}/users/{}/photos".format(self.root, self.user)`
			`data = {"_extractor": XhamsterGalleryExtractor}`

			`while url:`
			`extr = text.extract_from(self.request(url).text)`
			`while True:`
[xhamster] fix user profile extraction 2020-10-15 18:57:35 +02:00			`url = extr('thumb-image-container role-pop" href="', '"')`
[xhamster] add gallery & user extractor (#281) 2019-06-04 22:23:32 +02:00			`if not url:`
			`break`
			`yield Message.Queue, url, data`
			`url = extr('data-page="next" href="', '"')`