[gelbooru] tag-splitting for non-api mode

2024-11-22 10:42:34 +01:00 · 2018-07-06 15:18:49 +02:00 · 2018-07-06 15:18:49 +02:00 · 1d43cbbf52
commit 1d43cbbf52
parent 2eefaa99a3
4 changed files with 15 additions and 15 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -621,6 +621,8 @@ extractor.3dbooru.tags
 ----------------------
 extractor.e621.tags
 -------------------
+extractor.gelbooru.tags
+-----------------------
 extractor.konachan.tags
 -----------------------
 extractor.rule34.tags
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -31,10 +31,7 @@ class BooruExtractor(SharedConfigExtractor):
    def __init__(self, match):
        super().__init__()
        self.params = {}
-        self.prepare = None
-
-        if self.post_url and self.config("tags", False):
-            self.prepare = self._extended_tags
+        self.extags = self.post_url and self.config("tags", False)

    def skip(self, num):
        pages = num // self.per_page
@@ -62,8 +59,8 @@ class BooruExtractor(SharedConfigExtractor):
                if url.startswith("/"):
                    url = text.urljoin(self.api_url, url)
                image.update(data)
-                if self.prepare:
-                    self.prepare(image)
+                if self.extags:
+                    self.extended_tags(image)
                yield Message.Url, url, text.nameext_from_url(url, image)

            if len(images) < self.per_page:
@@ -89,17 +86,16 @@ class BooruExtractor(SharedConfigExtractor):
        """Collect metadata for extractor-job"""
        return {}

-    def _extended_tags(self, image):
+    def extended_tags(self, image, page=None):
        """Rerieve extended tag information"""
-        url = self.post_url.format(image["id"])
-        page = self.request(url).text
-        tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
-
+        if not page:
+            url = self.post_url.format(image["id"])
+            page = self.request(url).text
        tags = collections.defaultdict(list)
+        tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
        pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
-        for tag_type, tag_name in pattern.findall(tag_html):
+        for tag_type, tag_name in pattern.findall(tags_html):
            tags[tag_type].append(text.unquote(tag_name))
-
        for key, value in tags.items():
            image["tags_" + key] = " ".join(value)

@@ -185,7 +181,7 @@ class GelbooruPoolMixin(PoolMixin):

        return {
            "pool": text.parse_int(self.pool),
-            "pool_name": text.unescape(name or ""),
+            "pool_name": text.unescape(name),
            "count": len(self.posts),
        }

--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -68,6 +68,8 @@ class GelbooruExtractor(booru.XmlParserMixin,
        data["rating"] = (data["rating"] or "?")[0].lower()
        data["tags"] = " ".join(
            [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
+        if self.extags:
+            self.extended_tags(data, page)
        return data


--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -68,7 +68,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
               r"/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+)"]
    test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
-        "url": "a45c77f8fbde66091fe2346d6341f9cf3c6b1bc5",
+        "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
        "keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",
    })]