mirror of
https://github.com/mikf/gallery-dl.git
synced 2024-11-22 10:42:34 +01:00
generalize tag-splitting option (#92)
- extend functionality to other booru sites: - http://behoimi.org/ - https://konachan.com/ - https://e621.net/ - https://rule34.xxx/ - https://safebooru.org/ - https://yande.re/
This commit is contained in:
parent
188e956c4e
commit
4a57509392
@ -617,20 +617,23 @@ Description A (comma-separated) list of post types to extract images, etc. from.
|
||||
=========== =====
|
||||
|
||||
|
||||
extractor.3dbooru.tags
|
||||
----------------------
|
||||
extractor.e621.tags
|
||||
-------------------
|
||||
extractor.konachan.tags
|
||||
-----------------------
|
||||
extractor.rule34.tags
|
||||
---------------------
|
||||
extractor.safebooru.tags
|
||||
------------------------
|
||||
extractor.yandere.tags
|
||||
----------------------
|
||||
=========== =====
|
||||
Type ``bool``
|
||||
Default ``false``
|
||||
Description Split tags into different categories
|
||||
and provide the following additional metadata-entries:
|
||||
|
||||
- ``tags_artist``
|
||||
- ``tags_character``
|
||||
- ``tags_circle``
|
||||
- ``tags_copyright``
|
||||
- ``tags_faults``
|
||||
- ``tags_general``
|
||||
Description Categorize tags by their respective types
|
||||
and provide them as ``tags_<type>`` metadata fields.
|
||||
|
||||
Note: This requires 1 additional HTTP request for each post.
|
||||
=========== =====
|
||||
|
@ -15,6 +15,7 @@ class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
|
||||
"""Base class for 3dbooru extractors"""
|
||||
category = "3dbooru"
|
||||
api_url = "http://behoimi.org/post/index.json"
|
||||
post_url = "http://behoimi.org/post/show/{}"
|
||||
page_limit = 1000
|
||||
|
||||
def __init__(self, match):
|
||||
@ -53,6 +54,13 @@ class ThreedeebooruPostExtractor(booru.PostMixin,
|
||||
test = [("http://behoimi.org/post/show/140852", {
|
||||
"url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
|
||||
"content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
|
||||
"options": (("tags", True),),
|
||||
"keyword": {
|
||||
"tags_character": "furude_rika",
|
||||
"tags_copyright": "higurashi_no_naku_koro_ni",
|
||||
"tags_model": "himekawa_azuru",
|
||||
"tags_general": str,
|
||||
},
|
||||
})]
|
||||
|
||||
|
||||
|
@ -11,8 +11,10 @@
|
||||
from .common import SharedConfigExtractor, Message
|
||||
from .. import text
|
||||
from xml.etree import ElementTree
|
||||
import collections
|
||||
import datetime
|
||||
import operator
|
||||
import re
|
||||
|
||||
|
||||
class BooruExtractor(SharedConfigExtractor):
|
||||
@ -20,6 +22,7 @@ class BooruExtractor(SharedConfigExtractor):
|
||||
basecategory = "booru"
|
||||
filename_fmt = "{category}_{id}_{md5}.{extension}"
|
||||
api_url = ""
|
||||
post_url = ""
|
||||
per_page = 50
|
||||
page_start = 1
|
||||
page_limit = None
|
||||
@ -28,6 +31,10 @@ class BooruExtractor(SharedConfigExtractor):
|
||||
def __init__(self, match):
    """Set up empty request parameters and the optional 'prepare' hook"""
    super().__init__()
    self.params = {}

    # Tag categorization is opt-in via the "tags" option and is only
    # possible for extractors that define a 'post_url' to query.
    want_tags = self.post_url and self.config("tags", False)
    self.prepare = self._extended_tags if want_tags else None
|
||||
|
||||
def skip(self, num):
|
||||
pages = num // self.per_page
|
||||
@ -50,17 +57,18 @@ class BooruExtractor(SharedConfigExtractor):
|
||||
for image in images:
|
||||
try:
|
||||
url = image["file_url"]
|
||||
if url.startswith("/"):
|
||||
url = text.urljoin(self.api_url, url)
|
||||
image.update(data)
|
||||
self.prepare(image)
|
||||
yield Message.Url, url, text.nameext_from_url(url, image)
|
||||
except KeyError:
|
||||
continue
|
||||
if url.startswith("/"):
|
||||
url = text.urljoin(self.api_url, url)
|
||||
image.update(data)
|
||||
if self.prepare:
|
||||
self.prepare(image)
|
||||
yield Message.Url, url, text.nameext_from_url(url, image)
|
||||
|
||||
if len(images) < self.per_page:
|
||||
return
|
||||
self.update_page(images[-1])
|
||||
self.update_page(image)
|
||||
|
||||
def reset_page(self):
|
||||
"""Initialize params to point to the first page"""
|
||||
@ -81,8 +89,19 @@ class BooruExtractor(SharedConfigExtractor):
|
||||
"""Collect metadata for extractor-job"""
|
||||
return {}
|
||||
|
||||
def prepare(self, image):
|
||||
"""Prepare and modify an 'image' object"""
|
||||
def _extended_tags(self, image):
    """Retrieve extended tag information

    Requests the post page built from 'post_url' and the "id" of
    'image', scans its tag list for 'tag-type-<type>' entries and
    stores the space-separated tag names of each type in 'image'
    under the key "tags_<type>".

    'image' is left unchanged if no tag list is found on the page.
    """
    url = self.post_url.format(image["id"])
    page = self.request(url).text
    tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
    if not tag_html:
        # No tag list on the page (changed markup, error page, ...).
        # Assumes text.extract() yields a falsy value when the markers
        # are absent -- TODO confirm. Skip categorization instead of
        # handing a non-string to findall() and aborting the whole job.
        return

    tags = collections.defaultdict(list)
    # group 1: tag type taken from the CSS class, group 2: URL-quoted
    # tag name taken from the following "tags=" query parameter
    pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
    for tag_type, tag_name in pattern.findall(tag_html):
        tags[tag_type].append(text.unquote(tag_name))

    for key, value in tags.items():
        image["tags_" + key] = " ".join(value)
|
||||
|
||||
|
||||
class XmlParserMixin():
|
||||
|
@ -15,6 +15,7 @@ class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
|
||||
"""Base class for e621 extractors"""
|
||||
category = "e621"
|
||||
api_url = "https://e621.net/post/index.json"
|
||||
post_url = "https://e621.net/post/show/{}"
|
||||
page_limit = 750
|
||||
|
||||
|
||||
@ -48,6 +49,12 @@ class E621PostExtractor(booru.PostMixin, E621Extractor):
|
||||
test = [("https://e621.net/post/show/535", {
|
||||
"url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
|
||||
"content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
|
||||
"options": (("tags", True),),
|
||||
"keyword": {
|
||||
"tags_artist": "anry",
|
||||
"tags_general": str,
|
||||
"tags_species": str,
|
||||
},
|
||||
})]
|
||||
|
||||
|
||||
|
@ -16,9 +16,10 @@ class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
|
||||
category = "konachan"
|
||||
|
||||
def __init__(self, match):
|
||||
root = "https://konachan." + match.group("tld")
|
||||
self.api_url = root + "/post.json"
|
||||
self.post_url = root + "/post/show/{}"
|
||||
super().__init__(match)
|
||||
self.api_url = "https://konachan.{tld}/post.json".format(
|
||||
tld=match.group("tld"))
|
||||
|
||||
|
||||
class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
|
||||
@ -26,10 +27,10 @@ class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
|
||||
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
|
||||
r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"]
|
||||
test = [
|
||||
("http://konachan.com/post?tags=patata", {
|
||||
("https://konachan.com/post?tags=patata", {
|
||||
"content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
|
||||
}),
|
||||
("http://konachan.net/post?tags=patata", None),
|
||||
("https://konachan.net/post?tags=patata", None),
|
||||
]
|
||||
|
||||
|
||||
@ -38,10 +39,10 @@ class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
|
||||
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
|
||||
r"/pool/show/(?P<pool>\d+)"]
|
||||
test = [
|
||||
("http://konachan.com/pool/show/95", {
|
||||
("https://konachan.com/pool/show/95", {
|
||||
"content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
|
||||
}),
|
||||
("http://konachan.net/pool/show/95", None),
|
||||
("https://konachan.net/pool/show/95", None),
|
||||
]
|
||||
|
||||
|
||||
@ -50,10 +51,17 @@ class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
|
||||
pattern = [r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
|
||||
r"/post/show/(?P<post>\d+)"]
|
||||
test = [
|
||||
("http://konachan.com/post/show/205189", {
|
||||
("https://konachan.com/post/show/205189", {
|
||||
"content": "674e75a753df82f5ad80803f575818b8e46e4b65",
|
||||
"options": (("tags", True),),
|
||||
"keyword": {
|
||||
"tags_artist": "patata",
|
||||
"tags_character": "clownpiece",
|
||||
"tags_copyright": "touhou",
|
||||
"tags_general": str,
|
||||
},
|
||||
}),
|
||||
("http://konachan.net/post/show/205189", None),
|
||||
("https://konachan.net/post/show/205189", None),
|
||||
]
|
||||
|
||||
|
||||
|
@ -17,6 +17,7 @@ class Rule34Extractor(booru.XmlParserMixin,
|
||||
"""Base class for rule34 extractors"""
|
||||
category = "rule34"
|
||||
api_url = "https://rule34.xxx/index.php"
|
||||
post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
|
||||
page_limit = 4000
|
||||
|
||||
def __init__(self, match):
|
||||
@ -28,7 +29,7 @@ class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
|
||||
"""Extractor for images from rule34.xxx based on search-tags"""
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
|
||||
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
|
||||
test = [("http://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
|
||||
test = [("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
|
||||
"content": "a01768c6f86f32eb7ebbdeb87c30b0d9968d7f97",
|
||||
"pattern": r"https?://(.?img\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
|
||||
"count": 2,
|
||||
@ -39,6 +40,14 @@ class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
|
||||
"""Extractor for single images from rule34.xxx"""
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
|
||||
r"\?page=post&s=view&id=(?P<post>\d+)")]
|
||||
test = [("http://rule34.xxx/index.php?page=post&s=view&id=1974854", {
|
||||
test = [("https://rule34.xxx/index.php?page=post&s=view&id=1974854", {
|
||||
"content": "fd2820df78fb937532da0a46f7af6cefc4dc94be",
|
||||
"options": (("tags", True),),
|
||||
"keyword": {
|
||||
"tags_artist": "danraku",
|
||||
"tags_character": "io_(pso2)",
|
||||
"tags_copyright": "phantasy_star phantasy_star_online_2",
|
||||
"tags_general": "blue_hair female",
|
||||
"tags_metadata": "absurdres highres",
|
||||
},
|
||||
})]
|
||||
|
@ -17,6 +17,7 @@ class SafebooruExtractor(booru.XmlParserMixin,
|
||||
"""Base class for safebooru extractors"""
|
||||
category = "safebooru"
|
||||
api_url = "https://safebooru.org/index.php"
|
||||
post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
|
||||
|
||||
def __init__(self, match):
|
||||
super().__init__(match)
|
||||
@ -27,7 +28,7 @@ class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
|
||||
"""Extractor for images from safebooru.org based on search-tags"""
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
|
||||
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")]
|
||||
test = [("http://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
|
||||
test = [("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
|
||||
"url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
|
||||
"content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
|
||||
})]
|
||||
@ -37,7 +38,14 @@ class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
|
||||
"""Extractor for single images from safebooru.org"""
|
||||
pattern = [(r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
|
||||
r"\?page=post&s=view&id=(?P<post>\d+)")]
|
||||
test = [("http://safebooru.org/index.php?page=post&s=view&id=1169132", {
|
||||
test = [("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
|
||||
"url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
|
||||
"content": "93b293b27dabd198afafabbaf87c49863ac82f27",
|
||||
"options": (("tags", True),),
|
||||
"keyword": {
|
||||
"tags_artist": "kawanakajima",
|
||||
"tags_character": "heath_ledger ronald_mcdonald the_joker",
|
||||
"tags_copyright": "dc_comics mcdonald's the_dark_knight",
|
||||
"tags_general": str,
|
||||
},
|
||||
})]
|
||||
|
@ -9,37 +9,13 @@
|
||||
"""Extract images from https://yande.re/"""
|
||||
|
||||
from . import booru
|
||||
from .. import text
|
||||
|
||||
|
||||
class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
|
||||
"""Base class for yandere extractors"""
|
||||
category = "yandere"
|
||||
api_url = "https://yande.re/post.json"
|
||||
|
||||
def __init__(self, match):
|
||||
super().__init__(match)
|
||||
if self.config("tags", False):
|
||||
self.prepare = self._categorize_tags
|
||||
|
||||
def _categorize_tags(self, image):
|
||||
url = "https://yande.re/post/show/{}".format(image["id"])
|
||||
page = self.request(url).text
|
||||
taghtml = text.extract(page, '<ul id="tag-sidebar">', '</ul>')[0]
|
||||
|
||||
pos = 0
|
||||
tags = {"artist": [], "copyright": [], "character": [],
|
||||
"circle": [], "faults": [], "general": []}
|
||||
|
||||
while True:
|
||||
tagtype, pos = text.extract(taghtml, "tag-type-", '"', pos)
|
||||
if not tagtype:
|
||||
break
|
||||
tagname, pos = text.extract(taghtml, "?tags=", '"', pos)
|
||||
tags[tagtype].append(text.unquote(tagname))
|
||||
|
||||
for key, value in tags.items():
|
||||
image["tags_" + key] = " ".join(value)
|
||||
post_url = "https://yande.re/post/show/{}"
|
||||
|
||||
|
||||
class YandereTagExtractor(booru.TagMixin, YandereExtractor):
|
||||
@ -69,8 +45,6 @@ class YanderePostExtractor(booru.PostMixin, YandereExtractor):
|
||||
"tags_artist": "sasaki_tamaru",
|
||||
"tags_circle": "softhouse_chara",
|
||||
"tags_copyright": "ouzoku",
|
||||
"tags_character": str,
|
||||
"tags_faults": str,
|
||||
"tags_general": str,
|
||||
},
|
||||
})]
|
||||
|
Loading…
Reference in New Issue
Block a user