2015-04-11 00:16:17 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-01-13 16:32:32 +01:00
|
|
|
# Copyright 2015-2023 Mike Fährmann
|
2015-04-11 00:16:17 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2020-12-08 18:31:59 +01:00
|
|
|
"""Extractors for *booru sites"""
|
|
|
|
|
2021-01-27 01:33:01 +01:00
|
|
|
from .common import BaseExtractor, Message
|
2021-02-17 00:12:51 +01:00
|
|
|
from .. import text
|
2020-12-24 01:04:44 +01:00
|
|
|
import operator
|
2015-04-11 00:16:17 +02:00
|
|
|
|
2017-02-01 00:53:19 +01:00
|
|
|
|
2021-01-27 01:33:01 +01:00
|
|
|
class BooruExtractor(BaseExtractor):
    """Common base for extractors targeting *booru-style sites

    The hook methods login(), metadata(), posts(), _prepare(), _html(),
    _tags() and _notes() have placeholder implementations here that
    either do nothing or return an empty tuple.
    """
    basecategory = "booru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    page_start = 0
    per_page = 100

    def items(self):
        """Yield a Directory and a Url message for each usable post

        Posts for which no download URL can be obtained are skipped
        after a warning has been logged.
        """
        self.login()
        metadata = self.metadata()
        want_tags = self.config("tags", False)
        want_notes = self.config("notes", False)

        # The 'url' option selects which post field(s) provide the file
        # URL: a list/tuple of keys enables the multi-key lookup, any
        # other truthy value is used as a single key.
        key = self.config("url")
        if key and isinstance(key, (list, tuple)):
            self._file_url_keys = key
            self._file_url = self._file_url_list
        elif key:
            self._file_url = operator.itemgetter(key)

        for post in self.posts():
            try:
                url = self._file_url(post)
                if url[0] == "/":
                    # root-relative path; prepend the site root
                    url = self.root + url
            except Exception as exc:
                self.log.debug("%s: %s", exc.__class__.__name__, exc)
                self.log.warning(
                    "Unable to fetch download URL for post %s (md5: %s)",
                    post.get("id"), post.get("md5"))
                continue

            # the post's HTML is requested once and shared by both hooks
            if want_tags or want_notes:
                page = self._html(post)
                if want_tags:
                    self._tags(post, page)
                if want_notes:
                    self._notes(post, page)

            text.nameext_from_url(url, post)
            post.update(metadata)
            self._prepare(post)

            yield Message.Directory, post
            yield Message.Url, url, post

    def skip(self, num):
        """Advance 'page_start' by the full pages contained in 'num'

        Returns the number of posts covered by those full pages.
        """
        page_size = self.per_page
        full_pages = num // page_size
        self.page_start += full_pages
        return full_pages * page_size

    def login(self):
        """Log in and set whatever cookies are required"""

    def metadata(self):
        """Return general metadata that gets merged into every post"""
        # an empty tuple leaves a post untouched when passed to update()
        return ()

    def posts(self):
        """Return an iterable of post objects"""
        return ()

    # default URL lookup: read the post's "file_url" field
    _file_url = operator.itemgetter("file_url")

    def _file_url_list(self, post):
        """Return the first available URL from the configured key list

        The lazily evaluated remainder is stored as post["_fallback"].
        Raises StopIteration when none of the keys has a truthy value.
        """
        candidates = (
            post[name]
            for name in self._file_url_keys
            if post.get(name)
        )
        post["_fallback"] = candidates
        return next(candidates)

    def _prepare(self, post):
        """Adjust the metadata of a 'post' before it is emitted"""

    def _html(self, post):
        """Return the HTML content belonging to a post"""

    def _tags(self, post, page):
        """Extract extended tag metadata from 'page' into 'post'"""

    def _notes(self, post, page):
        """Extract notes metadata from 'page' into 'post'"""
|