1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-25 20:22:36 +01:00
gallery-dl/gallery_dl/extractor/postmill.py

204 lines
7.2 KiB
Python
Raw Normal View History

2023-12-12 10:54:34 +01:00
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Postmill instances"""
import re
from .common import BaseExtractor, Message
from .. import text, exception
class PostmillExtractor(BaseExtractor):
    """Base class for Postmill extractors"""
    basecategory = "postmill"
    directory_fmt = ("{category}", "{instance}", "{forum}")
    filename_fmt = "{id}_{title[:220]}.{extension}"
    # NOTE(review): items() emits text bodies with an empty "filename";
    # confirm that archive entries stay distinct for such posts.
    archive_fmt = "{filename}"

    def _init(self):
        """Set up per-instance settings and precompiled search helpers"""
        # the part of the root URL that follows "://"
        self.instance = self.root.partition("://")[2]
        self.save_link_post_body = self.config("save-link-post-body", False)

        canonical_re = re.compile(r"/f/([\w\d_]+)/(\d+)/")
        image_link_re = re.compile(
            r'<a href="[^"]+"\n +class="submission__image-link"')
        self._search_canonical_url = canonical_re.search
        self._search_image_tag = image_link_re.search

    def items(self):
        """Yield directory, file and queue messages for each post

        The posts to process come from self.post_urls(), which this class
        does not define itself; a subclass relying on this method has to
        provide it.
        """
        for post_url in self.post_urls():
            page = self.request(post_url).text

            # NOTE(review): 'extr' presumably scans the page front to back,
            # so these lookups are kept in their original order -- confirm
            # against text.extract_from before reordering them.
            extr = text.extract_from(page)
            title = text.unescape(extr(
                '<meta property="og:title" content="', '">'))
            date = text.parse_datetime(extr(
                '<meta property="og:article:published_time" content="',
                '">'))
            username = extr(
                '<meta property="og:article:author" content="', '">')
            canonical_url = text.unescape(extr(
                '<link rel="canonical" href="', '">'))
            link = text.unescape(extr(
                '<h1 class="submission__title unheaderize inline"><a href="',
                '"'))
            body = extr(
                '<div class="submission__body break-text text-flow">',
                '</div>')

            # forum name and numeric post ID are read from the canonical URL
            match = self._search_canonical_url(canonical_url)
            forum = match.group(1)
            post_id = int(match.group(2))

            # a title link with a site-relative href marks a text post
            is_text_post = link.startswith("/")
            is_image_post = self._search_image_tag(page) is not None

            # collect (message type, URL) pairs to emit for this post
            targets = []
            if is_text_post or self.save_link_post_body:
                targets.append((Message.Url, "text:" + body))
            if is_image_post:
                targets.append((Message.Url, link))
            elif not is_text_post:
                targets.append((Message.Queue, link))

            data = {
                "title": title,
                "date": date,
                "username": username,
                "forum": forum,
                "id": post_id,
                "flair": [
                    text.unescape(label)
                    for label in text.extract_iter(
                        page, '<span class="flair__label">', '</span>')
                ],
                "instance": self.instance,
                "count": len(targets),
            }
            yield Message.Directory, data

            for num, (msg, target) in enumerate(targets, 1):
                data["num"] = num
                if target.startswith("text:"):
                    # inline text has no URL to derive a name from
                    data["filename"] = ""
                    data["extension"] = "htm"
                else:
                    data = text.nameext_from_url(target, data)
                yield msg, target, data
class PostmillSubmissionsExtractor(PostmillExtractor):
    """Base class for extractors that walk a paginated submission listing"""
    # extra query parameter names a subclass wants forwarded to the site
    whitelisted_parameters = ()

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)

        # the last three capture groups are, in this order:
        # listing path, optional sorting path, optional query string
        captured = match.groups()
        self.base = captured[-3]
        self.sorting_path = captured[-2] or ""

        # keep only the query parameters accepted by acceptable_query()
        params = text.parse_query(captured[-1])
        accepted = {}
        for name, value in params.items():
            if self.acceptable_query(name):
                accepted[name] = value
        self.query = accepted

    def items(self):
        """Queue every submission of the listing, following 'next' links"""
        login_url = self.root + "/login"
        url = self.root + self.base + self.sorting_path

        while url:
            response = self.request(url, params=self.query)

            # a redirect that ends on the login page aborts the extraction
            if response.history and response.url == login_url:
                raise exception.StopExtraction(
                    "HTTP redirect to login page (%s)", response.url)

            page = response.text
            navs = text.extract_iter(
                page, '<nav class="submission__nav">', '</nav>')
            for nav in navs:
                # first link inside each submission's <nav> element
                href = text.unescape(text.extr(nav, '<a href="', '"'))
                post_url = text.urljoin(url, href)
                yield Message.Queue, post_url, {
                    "_extractor": PostmillPostExtractor}

            # an empty result ends the loop
            url = text.unescape(text.extr(
                page, '<link rel="next" href="', '">'))

    def acceptable_query(self, key):
        """Return True if query parameter 'key' may be sent to the site"""
        if key == "t" or key in self.whitelisted_parameters:
            return True
        # parameters of the form "next[...]"
        return key.startswith("next[") and key.endswith("]")
# Register the known Postmill instances. The returned value is used below
# as the common prefix of every extractor pattern.
BASE_PATTERN = PostmillExtractor.update({
    "raddle": {
        "root": None,
        # clearnet domain or the .onion address
        "pattern": (
            r"(?:"
            r"raddle\.me"
            r"|c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
            r"\.onion"
            r")"
        ),
    },
})
2023-12-23 03:28:36 +01:00
# Optional query string up to the end of the URL; the captured group holds
# everything after "?" and before any "#".
QUERY_RE = r"(?:\?([^#]+))?$"

# Optional sorting path segment (captured with its leading slash),
# followed by the optional query string.
SORTING_RE = (
    r"(/(?:" +
    "|".join((
        "hot",
        "new",
        "active",
        "top",
        "controversial",
        "most_commented",
    )) +
    r"))?" +
    QUERY_RE
)
2023-12-12 10:54:34 +01:00
class PostmillPostExtractor(PostmillExtractor):
    """Extractor for one submission, addressed by forum name and post ID"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
    example = "https://raddle.me/f/FORUM/123/TITLE"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        # groups 3 and 4: forum name and numeric post ID
        self.forum, self.post_id = match.group(3, 4)

    def post_urls(self):
        """Return a 1-tuple holding the URL of this submission"""
        post_url = "/".join((self.root, "f", self.forum, self.post_id))
        return (post_url,)
class PostmillShortURLExtractor(PostmillExtractor):
    """Extractor resolving short (ID-only) submission URLs"""
    subcategory = "shorturl"
    pattern = BASE_PATTERN + r"/(\d+)$"
    example = "https://raddle.me/123"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        self.post_id = match.group(3)

    def items(self):
        """Queue the URL that the short URL redirects to"""
        short_url = self.root + "/" + self.post_id

        # HEAD request without following the redirect:
        # only the "Location" header of the response is used
        response = self.request(
            short_url, method="HEAD", allow_redirects=False)
        location = response.headers["Location"]

        target = text.urljoin(short_url, location)
        yield Message.Queue, target, {"_extractor": PostmillPostExtractor}
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
    """Extractor for the submission listings of the home page"""
    subcategory = "home"
    # listing path: "/" optionally followed by featured, subscribed or all
    pattern = (
        BASE_PATTERN +
        r"(/(?:featured|subscribed|all)?)" +
        SORTING_RE
    )
    example = "https://raddle.me/"
class PostmillForumExtractor(PostmillSubmissionsExtractor):
    """Extractor for the submissions posted to one forum"""
    subcategory = "forum"
    # listing path: "/f/" followed by the forum name
    pattern = (
        BASE_PATTERN +
        r"(/f/\w+)" +
        SORTING_RE
    )
    example = "https://raddle.me/f/FORUM"
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
    """Extractor for the submissions of one user"""
    subcategory = "usersubmissions"
    # The empty group "()" stands in for the sorting path, so that the last
    # three groups remain (listing path, sorting path, query string).
    pattern = (
        BASE_PATTERN +
        r"(/user/\w+/submissions)()" +
        QUERY_RE
    )
    example = "https://raddle.me/user/USER/submissions"
class PostmillTagExtractor(PostmillSubmissionsExtractor):
    """Extractor for the submissions listed under a specific tag"""
    subcategory = "tag"
    # listing path: "/tag/" followed by the tag name
    pattern = (
        BASE_PATTERN +
        r"(/tag/\w+)" +
        SORTING_RE
    )
    example = "https://raddle.me/tag/TAG"
class PostmillSearchExtractor(PostmillSubmissionsExtractor):
    """Extractor for the results of a search query"""
    subcategory = "search"
    # The empty group "()" stands in for the sorting path; the query string
    # is mandatory here and has to begin with "q=".
    pattern = (
        BASE_PATTERN +
        r"(/search)()\?(q=[^#]+)$"
    )
    example = "https://raddle.me/search?q=QUERY"
    # forward the search term "q" in addition to the default parameters
    whitelisted_parameters = ("q",)