2019-06-07 16:31:20 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-06-15 16:24:18 +02:00
|
|
|
# Copyright 2019-2023 Mike Fährmann
|
2019-06-07 16:31:20 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for https://www.pornhub.com/"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text, exception
|
|
|
|
|
2021-12-29 22:39:29 +01:00
|
|
|
# Regex prefix shared by the extractor patterns in this module:
# an optional "http(s)://" scheme and an optional subdomain,
# followed by the literal host "pornhub.com".
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"
|
2019-06-07 16:31:20 +02:00
|
|
|
|
|
|
|
|
|
|
|
class PornhubExtractor(Extractor):
    """Common base for all pornhub.com extractors"""
    category = "pornhub"
    root = "https://www.pornhub.com"

    def _init(self):
        """Set the 'accessAgeDisclaimerPH' cookie for '.pornhub.com'"""
        # NOTE(review): presumably marks the age disclaimer as accepted
        # so regular pages are served - confirm against the site.
        cookie_name = "accessAgeDisclaimerPH"
        self.cookies.set(cookie_name, "1", domain=".pornhub.com")

    def _pagination(self, user, path):
        """Yield the response text of consecutive 'ajax' listing pages

        'user' and 'path' are URL path components. A 'path' without
        any slash gets "/public" appended before the URL is built.

        Redirects (3xx) are not followed automatically: the request URL
        is rebuilt from the 'location' header and the same page number
        is requested again.

        This generator has no stop condition of its own; the consumer
        decides when to stop iterating.
        """
        if "/" not in path:
            path += "/public"

        url = "{}/{}/{}/ajax".format(self.root, user, path)
        # Referer is the listing URL without its trailing "/ajax"
        referer = url[:-5]
        headers = {
            "Referer": referer,
            "X-Requested-With": "XMLHttpRequest",
        }
        params = {"page": 1}

        while True:
            response = self.request(
                url, method="POST", headers=headers, params=params,
                allow_redirects=False)

            status = response.status_code
            if status < 300 or status >= 400:
                # regular response: emit it and advance to the next page
                yield response.text
                params["page"] += 1
            else:
                # redirect: retry the same page at the new location
                location = response.headers["location"]
                url = "{}{}/{}/ajax".format(self.root, location, path)
|
|
|
|
|
2019-06-07 16:31:20 +02:00
|
|
|
|
|
|
|
class PornhubGalleryExtractor(PornhubExtractor):
    """Extractor for the images of a single pornhub.com album"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}")
    filename_fmt = "{num:>03}_{id}.{extension}"
    archive_fmt = "{id}"
    pattern = BASE_PATTERN + r"/album/(\d+)"
    example = "https://www.pornhub.com/album/12345"

    def __init__(self, match):
        PornhubExtractor.__init__(self, match)
        self.gallery_id = match.group(1)
        # ID of the first photo linked on the album page;
        # assigned by metadata() and used by images() as start/end key
        self._first = None

    def items(self):
        """Yield a directory message followed by one URL message per image"""
        album = self.metadata()
        yield Message.Directory, album

        for num, image in enumerate(self.images(), start=1):
            image_url = image["url"]
            # merge album-level metadata into the per-image metadata
            image.update(album)
            image["num"] = num
            yield (Message.Url, image_url,
                   text.nameext_from_url(image_url, image))

    def metadata(self):
        """Fetch the album page and return album-level metadata

        As a side effect, stores the first photo ID in 'self._first'.
        """
        page_url = "{}/album/{}".format(self.root, self.gallery_id)
        page = self.request(page_url).text
        extr = text.extract_from(page)

        # NOTE(review): the extractor presumably advances through the page
        # with each call, so the call order below is kept unchanged.
        title = extr("<title>", "</title>")
        score = extr('<div id="albumGreenBar" style="width:', '"')
        views = extr('<div id="viewsPhotAlbumCounter">', '<')
        tags = extr('<div id="photoTagsBox"', '<script')
        self._first = extr('<a href="/photo/', '"')

        # split "<album title> - <remainder>" at the last " - "
        title, _, user = title.rpartition(" - ")
        # drop the final 14 characters of the remainder
        # (presumably a fixed suffix of the page title - confirm)
        user = text.unescape(user[:-14])

        gallery = {
            "id"   : text.parse_int(self.gallery_id),
            "title": text.unescape(title),
            "score": text.parse_int(score.partition("%")[0]),
            "views": text.parse_int(views.partition(" ")[0]),
            "tags" : text.split_html(tags)[2:],
        }
        return {"user": user, "gallery": gallery}

    def images(self):
        """Yield a metadata dict for every image of the album

        The JSON response maps photo IDs to entries whose 'next' field
        names the following photo. Iteration starts at 'self._first'
        and stops once 'next' points back to that first ID.

        Raises AuthorizationError if the server answers
        "Permission denied".
        """
        url = "{}/album/show_album_json?album={}".format(
            self.root, self.gallery_id)
        response = self.request(url)

        if response.content == b"Permission denied":
            raise exception.AuthorizationError()

        images = response.json()
        first = self._first
        key = first
        finished = False

        while not finished:
            img = images[key]
            entry = {
                "url"    : img["img_large"],
                "caption": img["caption"],
                "id"     : text.parse_int(img["id"]),
                "views"  : text.parse_int(img["times_viewed"]),
                "score"  : text.parse_int(img["vote_percent"]),
            }
            yield entry

            # 'next' is converted to str to match the keys used above
            key = str(img["next"])
            finished = (key == first)
|
|
|
|
|
|
|
|
|
2023-08-29 19:34:27 +02:00
|
|
|
class PornhubGifExtractor(PornhubExtractor):
    """Extractor for a single gif on pornhub.com"""
    subcategory = "gif"
    directory_fmt = ("{category}", "{user}", "gifs")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}"
    pattern = BASE_PATTERN + r"/gif/(\d+)"
    example = "https://www.pornhub.com/gif/12345"

    def __init__(self, match):
        PornhubExtractor.__init__(self, match)
        # numeric gif ID captured from the URL (kept as a string)
        self.gallery_id = match.group(1)

    def items(self):
        """Yield a directory message and a single URL message for the gif"""
        page_url = "{}/gif/{}".format(self.root, self.gallery_id)
        page = self.request(page_url).text
        extr = text.extract_from(page)

        # NOTE(review): the extractor presumably advances through the page
        # with each call, so fields are filled in the original order.
        gif = {"id": self.gallery_id}
        gif["tags"] = extr("data-context-tag='", "'").split(",")
        gif["title"] = extr('"name": "', '"')
        gif["url"] = extr('"contentUrl": "', '"')
        gif["date"] = text.parse_datetime(
            extr('"uploadDate": "', '"'), "%Y-%m-%d")
        gif["viewkey"] = extr(
            'From this video: '
            '<a href="/view_video.php?viewkey=', '"')
        gif["timestamp"] = extr(
            'lass="directLink tstamp" rel="nofollow">', '<')
        gif["user"] = text.remove_html(extr("Created by:", "</div>"))

        yield Message.Directory, gif

        media_url = gif["url"]
        yield Message.Url, media_url, text.nameext_from_url(media_url, gif)
|
|
|
|
|
|
|
|
|
2019-06-07 16:31:20 +02:00
|
|
|
class PornhubUserExtractor(PornhubExtractor):
    """Extractor that dispatches a user URL to the photos/gifs extractors"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$"
    example = "https://www.pornhub.com/model/USER"

    def __init__(self, match):
        PornhubExtractor.__init__(self, match)
        # "<users|model|pornstar>/<name>" path captured from the URL
        self.user = match.group(1)

    def initialize(self):
        """Deliberate no-op override"""
        # NOTE(review): presumably skips the base-class setup because this
        # extractor only hands off to other extractors - confirm.

    def items(self):
        """Return the dispatch result for this user's sub-extractors"""
        base = "{}/{}/".format(self.root, self.user)
        targets = (
            (PornhubPhotosExtractor, base + "photos"),
            (PornhubGifsExtractor  , base + "gifs"),
        )
        # second argument: ("photos",), presumably the default selection
        default = ("photos",)
        return self._dispatch_extractors(targets, default)
|
|
|
|
|
|
|
|
|
|
|
|
class PornhubPhotosExtractor(PornhubExtractor):
    """Extractor queueing every album listed under a user's photos"""
    subcategory = "photos"
    pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
               "/(photos(?:/[^/?#]+)?)")
    example = "https://www.pornhub.com/model/USER/photos"

    def __init__(self, match):
        PornhubExtractor.__init__(self, match)
        user, path = match.groups()
        self.user = user
        self.path = path

    def items(self):
        """Yield a queue message for each album ID found on each page

        Stops at the first listing page that contains no album IDs.
        """
        data = {"_extractor": PornhubGalleryExtractor}

        for page in self._pagination(self.user, self.path):
            found = False
            for gid in text.extract_iter(page, 'id="albumphoto', '"'):
                found = True
                album_url = self.root + "/album/" + gid
                yield Message.Queue, album_url, data

            if not found:
                # empty page: end of the listing
                break
|
|
|
|
|
2023-08-29 19:34:27 +02:00
|
|
|
|
|
|
|
class PornhubGifsExtractor(PornhubExtractor):
    """Extractor queueing every gif listed under a user's gifs"""
    subcategory = "gifs"
    pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
               "/(gifs(?:/[^/?#]+)?)")
    example = "https://www.pornhub.com/model/USER/gifs"

    def __init__(self, match):
        PornhubExtractor.__init__(self, match)
        user, path = match.groups()
        self.user = user
        self.path = path

    def items(self):
        """Yield a queue message for each gif ID found on each page

        Stops at the first listing page that contains no gif IDs.
        """
        data = {"_extractor": PornhubGifExtractor}

        for page in self._pagination(self.user, self.path):
            found = False
            for gid in text.extract_iter(page, 'id="gif', '"'):
                found = True
                gif_url = self.root + "/gif/" + gid
                yield Message.Queue, gif_url, data

            if not found:
                # empty page: end of the listing
                break
|