# -*- coding: utf-8 -*-
# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Gelbooru Beta 0.2 sites"""
from . import booru
from .. import text, util, exception
from xml.etree import ElementTree
import collections
import re
class GelbooruV02Extractor(booru.BooruExtractor):
basecategory = "gelbooru_v02"
def _init(self):
    """Read API credentials, choose the API root, and install overrides.

    'api-key' and 'user-id' are taken from the extractor configuration.
    The API root comes from this category's INSTANCES entry when it
    defines "api_root"; otherwise the regular site root is used.
    """
    option = self.config
    self.api_key = option("api-key")
    self.user_id = option("user-id")

    try:
        api_root = INSTANCES[self.category]["api_root"]
    except KeyError:
        # unknown category or no dedicated API host configured
        api_root = self.root
    self.api_root = api_root

    if self.category != "realbooru":
        return

    # realbooru uses its own item and tag handling
    self.items = self._items_realbooru
    self._tags = self._tags_realbooru
def _api_request(self, params):
    """Query the post index of the XML API and return the parsed root.

    'params' is forwarded as the query parameters of the request.
    Raises ElementTree.ParseError if the response body is not valid XML.
    """
    endpoint = self.api_root + "/index.php?page=dapi&s=post&q=index"
    response = self.request(endpoint, params=params)
    return ElementTree.fromstring(response.text)
def _pagination(self, params):
    """Yield the attribute dict of every post returned for 'params'.

    'params' is modified in place: "pid" (page index) and "limit" are
    set here, and "tags" is rewritten when the fallback below is used.

    When a response cannot be parsed as XML and the query has a "tags"
    parameter, the search is restarted at page 0 with an additional
    "id:<N" tag, where N is the ID of the last post yielded so far.

    Raises ElementTree.ParseError if a response cannot be parsed and no
    fallback is possible (no "tags" parameter, no post seen yet), or if
    the fallback query itself cannot be parsed.
    """
    params["pid"] = self.page_start
    params["limit"] = self.per_page

    post = None
    while True:
        try:
            root = self._api_request(params)
        except ElementTree.ParseError:
            if "tags" not in params or post is None:
                raise
            # replace any previous "id:<" constraint with one that
            # continues right below the last post that was yielded
            taglist = [tag for tag in params["tags"].split()
                       if not tag.startswith("id:<")]
            taglist.append("id:<" + str(post.attrib["id"]))
            params["tags"] = " ".join(taglist)
            params["pid"] = 0
            # Forget the last post: if the rewritten query fails as well,
            # the check above re-raises instead of rebuilding and retrying
            # the exact same request indefinitely.
            post = None
            continue

        # after the loop, 'post' is the last element of this page (or None)
        post = None
        for post in root:
            yield post.attrib

        if len(root) < self.per_page:
            return
        params["pid"] += 1
def _pagination_html(self, params):
    """Yield post attributes for the post IDs found on HTML listing pages.

    Each listing page is requested from '<root>/index.php' with 'params';
    every ID extracted from it is then looked up through the XML API.
    'params["pid"]' is modified in place and advances by 'per_page' per
    page, i.e. it is an offset in posts rather than a page index.
    Iteration ends after a page with fewer than 'per_page' IDs.
    """
    endpoint = self.root + "/index.php"
    query = {}
    params["pid"] = self.page_start * self.per_page

    while True:
        html = self.request(endpoint, params=params).text

        found = 0
        ids = text.extract_iter(html, '" id="p', '"')
        for found, post_id in enumerate(ids, 1):
            query["id"] = post_id
            for element in self._api_request(query):
                yield element.attrib

        if found < self.per_page:
            return
        params["pid"] += self.per_page
@staticmethod
def _prepare(post):
    """Store the parsed form of post["created_at"] as post["date"]."""
    created = post["created_at"]
    post["date"] = text.parse_datetime(created, "%a %b %d %H:%M:%S %z %Y")
def _html(self, post):
    """Request the view page of 'post' and return its response text."""
    url = "{}/index.php?page=post&s=view&id={}".format(self.root, post["id"])
    response = self.request(url)
    return response.text
def _tags(self, post, page):
tag_container = (text.extr(page, '
'))
if not tag_container:
return
tags = collections.defaultdict(list)
pattern = re.compile(
r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
for tag_type, tag_name in pattern.findall(tag_container):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
def _notes(self, post, page):
note_container = text.extr(page, 'id="note-container"', "", ""))),
})
def _file_url_realbooru(self, post):
    """Return the file URL to use for a realbooru post.

    The URL from post["file_url"] is kept when the post's MD5 appears in
    its preview URL and the file URL does not contain exactly five
    slashes.  Otherwise a URL of the form
    '<root>/images/<md5[0:2]>/<md5[2:4]>/<md5>.<ext>' is built, with the
    extension taken from the original file URL.
    """
    file_url = post["file_url"]
    md5 = post["md5"]

    if md5 in post["preview_url"] and file_url.count("/") != 5:
        return file_url

    extension = file_url.rpartition(".")[2]
    return "{}/images/{}/{}/{}.{}".format(
        self.root, md5[0:2], md5[2:4], md5, extension)
def _items_realbooru(self):
    """Yield a Directory and a Url message for every realbooru post.

    The download URL is read from the post's HTML page: it is the 'href'
    value found by searching backwards from the ">Original<" marker.
    Posts for which that lookup fails in any way are logged at debug
    level and skipped.
    """
    from .common import Message

    shared = self.metadata()

    for post in self.posts():
        try:
            page = self._html(post)
            marker = page.index(">Original<")
            url = text.rextract(page, 'href="', '"', marker)[0]
            post["file_url"] = url
        except Exception:
            # best effort: a missing page or marker only drops this post
            self.log.debug("Unable to fetch download URL for post %s "
                           "(md5: %s)", post.get("id"), post.get("md5"))
            continue

        text.nameext_from_url(url, post)
        post.update(shared)
        self._prepare(post)
        self._tags(post, page)

        yield Message.Directory, post
        yield Message.Url, url, post
def _tags_realbooru(self, post, page):
tag_container = text.extr(page, 'id="tagLink"', '')
tags = collections.defaultdict(list)
pattern = re.compile(
r'Pool: ", "")
if not name:
raise exception.NotFoundError("pool")
self.post_ids = text.extract_iter(
page, 'class="thumb" id="p', '"', pos)
return {
"pool": text.parse_int(self.pool_id),
"pool_name": text.unescape(name),
}
def posts(self):
    """Yield post attributes for the IDs in 'self.post_ids'.

    The ID iterator is first passed through util.advance() together with
    'self.page_start'; each remaining ID is looked up through the XML API.
    """
    query = {}
    remaining = util.advance(self.post_ids, self.page_start)
    for post_id in remaining:
        query["id"] = post_id
        for element in self._api_request(query):
            yield element.attrib
def _posts_pages(self):
    """Return the pool's posts as collected from its HTML 'show' pages."""
    query = {
        "page": "pool",
        "s"   : "show",
        "id"  : self.pool_id,
    }
    return self._pagination_html(query)
class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
    # Extractor for favorites listings
    # (index.php?page=favorites&s=view&id=<number>)
    subcategory = "favorite"
    directory_fmt = ("{category}", "favorites", "{favorite_id}")
    archive_fmt = "f_{favorite_id}_{id}"
    per_page = 50
    pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
    example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345"

    def __init__(self, match):
        """Remember the numeric ID captured by the last group of 'match'."""
        GelbooruV02Extractor.__init__(self, match)
        last_group = match.lastindex
        self.favorite_id = match.group(last_group)

    def metadata(self):
        """Return the favorites ID, converted with text.parse_int()."""
        favorite_id = text.parse_int(self.favorite_id)
        return {"favorite_id": favorite_id}

    def posts(self):
        """Return the posts collected from the favorites HTML pages."""
        query = {
            "page": "favorites",
            "s"   : "view",
            "id"  : self.favorite_id,
        }
        return self._pagination_html(query)
class GelbooruV02PostExtractor(GelbooruV02Extractor):
    # Extractor for a single post page
    # (index.php?page=post&s=view&id=<number>)
    subcategory = "post"
    archive_fmt = "{id}"
    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
    example = "https://safebooru.org/index.php?page=post&s=view&id=12345"

    def __init__(self, match):
        """Remember the post ID captured by the last group of 'match'."""
        GelbooruV02Extractor.__init__(self, match)
        last_group = match.lastindex
        self.post_id = match.group(last_group)

    def posts(self):
        """Return the API results for a query on this post's ID."""
        query = {"id": self.post_id}
        return self._pagination(query)