Mirror of https://github.com/mikf/gallery-dl.git, synced 2024-11-24 19:52:32 +01:00
[subscribestar] Refactoring extractor and handling audio content
- New support for embedded audio
- New support for external links that yt-dlp can handle
- Add a content_type field at the post level for directory creation
- Major rework of the extraction logic
- Add a check_if_supported_by_ytdlp helper function to util.py for handling yt-dlp-compatible external links
parent d2f50ecf09
commit e5e752d928
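For context on the "major rework of the extraction logic": the diff below replaces the old inline parsing in _media_from_post() with a table-driven _extract_media(), where each media type maps to a (start, end, processor) tuple, the text between the two delimiters is cut out of the post HTML, and the processor turns it into a media dict. The toy sketch below illustrates that dispatch pattern using plain string searches instead of gallery-dl's text helpers; every name in it is illustrative and not part of the project's API.

# Toy sketch of the table-driven media extraction used by the refactor.
# All names here are hypothetical; the real code uses text.extract_all().

def extract_between(html, start, end):
    """Return the text between the first `start` marker and the next `end`."""
    i = html.find(start)
    if i < 0:
        return ""
    i += len(start)
    j = html.find(end, i)
    return html[i:j] if j >= 0 else ""

def process_plain(segment, media_type):
    # Wrap the raw extracted value into the media-dict shape used downstream.
    return {"url": segment, "type": media_type}

MEDIA_CONFIG = {
    "link" : ('data-href="', '"', process_plain),
    "audio": ('<source src="', '" type="audio/', process_plain),
}

def extract_media(html, media_types):
    media = []
    for key in media_types:
        start, end, processor = MEDIA_CONFIG[key]
        segment = extract_between(html, start, end)
        if segment:
            media.append(processor(segment, key))
    return media

print(extract_media('<a data-href="https://example.com/clip"></a>', ("link", "audio")))
# -> [{'url': 'https://example.com/clip', 'type': 'link'}]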
gallery_dl/extractor/subscribestar.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2020-2023 Mike Fährmann
# Copyright 2020-2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -17,6 +17,7 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)"

class SubscribestarExtractor(Extractor):
    """Base class for subscribestar extractors"""
    category = "subscribestar"
    root = "https://www.subscribestar.com"
    directory_fmt = ("{category}", "{author_name}")
@@ -74,6 +75,7 @@ class SubscribestarExtractor(Extractor):
        response = self.request(
            url, method="POST", headers=headers, data=data, fatal=False)

        if response.json().get("errors"):
            self.log.debug(response.json()["errors"])
            raise exception.AuthenticationError()
@@ -84,44 +86,98 @@ class SubscribestarExtractor(Extractor):
            if cookie.name.startswith("auth")
        }

    def _media_from_post(self, html):
    def _extract_media(self, html, media_types):
        media = []
        media_config = {
            "gallery": ('data-gallery="', '"', self._process_gallery_item),
            "attachments": (
                'class="uploads-docs"',
                'data-role="post-edit_form"',
                self._process_attachment_item,
            ),
            "link": ('data-href="', '"', self._process_media_item),
            "audio": ('<source src="', '" type="audio/',
                      self._process_media_item),
        }

        gallery = text.extr(html, 'data-gallery="', '"')
        if gallery:
            for item in util.json_loads(text.unescape(gallery)):
                if "/previews" in item["url"]:
                    self._warn_preview()
                else:
                    media.append(item)
        for key, config in media_types.items():
            if key in media_config:
                start, end, processor = media_config[key]
                segments = (
                    text.extract_all(
                        html,
                        ((key, start, end),),
                    )[0],
                )
                for segment in segments:
                    if segment[key]:
                        content = processor(segment, key)
                        if content:
                            media.append(content)
        return media

        attachments = text.extr(
            html, 'class="uploads-docs"', 'data-role="post-edit_form"')
        if attachments:
            for att in attachments.split('class="doc_preview"')[1:]:
                media.append({
                    "id"  : text.parse_int(text.extr(
                        att, 'data-upload-id="', '"')),
                    "name": text.unescape(text.extr(
                        att, 'doc_preview-title">', '<')),
                    "url" : text.unescape(text.extr(att, 'href="', '"')),
                    "type": "attachment",
                })
    def _process_gallery_item(self, item, media_type):
        gallery_list = util.json_loads(text.unescape(item["gallery"]))
        for media in gallery_list:
            if "/previews" in media["url"]:
                self._warn_preview()
            return {"url": media["url"], "type": media_type}

    def _process_attachment_item(self, item, media_type):
        return {
            "id": text.parse_int(text.extr(item, 'data-upload-id="', '"')),
            "name": text.unescape(text.extr(item, 'doc_preview-title">', "<")),
            "url": text.unescape(text.extr(item, 'href="', '"')),
            "type": media_type,
        }

    def _process_media_item(self, item, media_type):
        if media_type == "link" and util.check_if_supported_by_ytdlp(
                item[media_type]):
            return {"url": "ytdl:" + item[media_type], "type": media_type}
        elif media_type == "audio":
            return {"url": item[media_type], "type": media_type}

    def _media_from_post(self, html):
        media_types = {
            "gallery": True,
            "attachments": True,
            "link": True,
            "audio": True,
        }
        media = self._extract_media(html, media_types)
        return media

    def _data_from_post(self, html):
        extr = text.extract_from(html)

        links = (text.extract_all(html, (("url", 'data-href="', '"'),), )[0],)
        audios = (text.extract_all(html, (("url", '<source src="',
                                           '" type="audio/'),),)[0],)
        gallery = text.extr(html, 'data-gallery="', '"')

        content_type = None
        if links and any(item["url"] for item in links):
            content_type = "link"
        if audios and any(item["url"] for item in audios):
            content_type = "audio"
        if gallery:
            for item in util.json_loads(text.unescape(gallery)):
                if item["type"] == "video":
                    content_type = "video"
                    break
            else:
                content_type = "image"

        return {
            "post_id"    : text.parse_int(extr('data-id="', '"')),
            "author_id"  : text.parse_int(extr('data-user-id="', '"')),
            "author_name": text.unescape(extr('href="/', '"')),
            "author_nick": text.unescape(extr('>', '<')),
            "date"       : self._parse_datetime(extr(
                'class="post-date">', '</').rpartition(">")[2]),
            "content"    : (extr(
                '<div class="post-content', '<div class="post-uploads')
                .partition(">")[2]),
            "post_id"      : text.parse_int(extr('data-id="', '"')),
            "author_id"    : text.parse_int(extr('data-user-id="', '"')),
            "author_name"  : text.unescape(extr('href="/', '"')),
            "author_nick"  : text.unescape(extr(">", "<")),
            "date"         : self._parse_datetime(
                extr('class="post-date">', "</").rpartition(">")[2]),
            "content"      : extr("<body>\n", "\n</body>"),
            "content_type" : content_type,
        }

    def _parse_datetime(self, dt):
@@ -166,17 +222,3 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
    def posts(self):
        url = "{}/posts/{}".format(self.root, self.item)
        return (self.request(url).text,)

    def _data_from_post(self, html):
        extr = text.extract_from(html)
        return {
            "post_id"    : text.parse_int(extr('data-id="', '"')),
            "author_name": text.unescape(extr('href="/', '"')),
            "author_id"  : text.parse_int(extr('data-user-id="', '"')),
            "author_nick": text.unescape(extr('alt="', '"')),
            "date"       : self._parse_datetime(extr(
                '<span class="star_link-types">', '<')),
            "content"    : (extr(
                '<div class="post-content', '<div class="post-uploads')
                .partition(">")[2]),
        }
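The content_type value added to _data_from_post() above follows a simple precedence: a post carrying data-href links is tagged "link", an embedded audio source overrides that with "audio", and a gallery overrides both, yielding "video" if any gallery item is a video and "image" otherwise (the for/else block). The snippet below is a hypothetical, self-contained restatement of that precedence, for illustration only:

# Hypothetical restatement of the content_type precedence; not project code.
def determine_content_type(has_links, has_audio, gallery_items):
    content_type = None
    if has_links:
        content_type = "link"
    if has_audio:
        content_type = "audio"      # audio outranks plain links
    if gallery_items:
        # a gallery outranks both; video wins over image within the gallery
        if any(item["type"] == "video" for item in gallery_items):
            content_type = "video"
        else:
            content_type = "image"
    return content_type

print(determine_content_type(True, False, [{"type": "image"}]))  # image
print(determine_content_type(True, True, []))                    # audio

Since the field is part of the post metadata, it can presumably be referenced in the directory format string, which is what the commit message means by "for directory creation".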
gallery_dl/util.py
@@ -24,7 +24,7 @@ import subprocess
import urllib.parse
from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
from . import text, version, exception
from . import text, version, exception, ytdl, config


def bencode(num, alphabet="0123456789"):
@@ -496,6 +496,15 @@ CODES = {
}


def check_if_supported_by_ytdlp(url):
    ytdl_module = ytdl.import_module(
        config.get(("extractor", "ytdl"), "module"))
    for ie in ytdl_module.extractor.gen_extractor_classes():
        if ie.suitable(url):
            return True
    return False


class HTTPBasicAuth():
    __slots__ = ("authorization",)
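The new util.check_if_supported_by_ytdlp() above asks every yt-dlp extractor whether it recognizes a URL; _process_media_item() in the subscribestar extractor then prefixes matching links with "ytdl:" so gallery-dl hands them to its ytdl downloader. Below is a minimal standalone sketch of the same check, assuming yt-dlp is installed and imported directly rather than through gallery-dl's ytdl/config indirection; unlike the helper in this commit, the sketch can also skip yt-dlp's catch-all "generic" extractor, which tends to accept almost any URL.

# Minimal sketch of a yt-dlp suitability check; assumes yt-dlp is installed
# and imports it directly instead of via gallery-dl's ytdl/config layer.
from yt_dlp.extractor import gen_extractor_classes

def supported_by_ytdlp(url, skip_generic=True):
    """Return True if some yt-dlp extractor claims the given URL."""
    for ie in gen_extractor_classes():
        if skip_generic and ie.ie_key() == "Generic":
            continue  # the catch-all extractor would accept nearly any URL
        if ie.suitable(url):
            return True
    return False

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
if supported_by_ytdlp(url):
    print("ytdl:" + url)  # the "ytdl:" prefix routes the URL to the ytdl downloader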