gallery-dl/gallery_dl/extractor/nozomi.py

# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://nozomi.la/"""

from .common import Extractor, Message
from .. import text


class NozomiExtractor(Extractor):
    """Base class for nozomi extractors"""
    category = "nozomi"
    root = "https://nozomi.la"
    filename_fmt = "{postid}.{extension}"
    archive_fmt = "{postid}"

    def items(self):
        """Yield a Version message, then one Directory and one Url message
        for every post ID produced by posts()
        """
        yield Message.Version, 1

        extra = self.metadata()
        headers = self.session.headers
        headers["Origin"] = self.root
        headers["Referer"] = self.root + "/"

        for post in self.posts():
            post_id = str(post)
            # the JSON path is built from the last digit of the ID
            # and the two digits in front of it
            url = "https://j.nozomi.la/post/{}/{}/{}.json".format(
                post_id[-1], post_id[-3:-1], post_id)
            image = self.request(url).json()

            # reduce every tag category to a plain list of display names
            image["tags"] = self._list(image.get("general"))
            for name in ("artist", "copyright", "character"):
                image[name] = self._list(image.get(name))

            image["is_video"] = bool(image.get("is_video"))

            # NOTE(review): ":00" presumably completes a short UTC offset
            # so that the value matches "%z" - confirm against API output
            timestamp = image["date"] + ":00"
            image["date"] = text.parse_datetime(
                timestamp, "%Y-%m-%d %H:%M:%S%z")

            file_url = text.urljoin(self.root, image["imageurl"])
            image["url"] = file_url
            text.nameext_from_url(file_url, image)
            image.update(extra)

            # remove the raw fields that were replaced above
            for key in ("general", "imageurl", "imageurls"):
                image.pop(key, None)

            yield Message.Directory, image
            yield Message.Url, image["url"], image

    def metadata(self):
        """Return extra metadata merged into every post (none by default)"""
        return {}

    def posts(self):
        """Return an iterable of post IDs to process (none by default)"""
        return ()

    @staticmethod
    def _list(src):
        """Return the 'tagname_display' value of each entry in 'src',
        or an empty list if 'src' is empty or None
        """
        return [entry["tagname_display"] for entry in src] if src else []

    @staticmethod
    def _unpack(b):
        """Yield every group of 4 bytes in 'b' as a big-endian integer"""
        for pos in range(0, len(b), 4):
            b0, b1, b2, b3 = b[pos], b[pos+1], b[pos+2], b[pos+3]
            yield (b0 << 24) + (b1 << 16) + (b2 << 8) + b3


class NozomiPostExtractor(NozomiExtractor):
    """Extractor for individual posts on nozomi.la"""
    subcategory = "post"
    pattern = r"(?:https?://)?nozomi\.la/post/(\d+)"
    test = ("https://nozomi.la/post/3649262.html", {
        "url": "f4522adfc8159355fd0476de28761b5be0f02068",
        "content": "cd20d2c5149871a0b80a1b0ce356526278964999",
        "keyword": {
            "artist"   : ["hammer (sunset beach)"],
            "character": ["patchouli knowledge"],
            "copyright": ["touhou"],
            "dataid"   : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5cf5a",
            "date"     : "type:datetime",
            "extension": "jpg",
            "favorites": int,
            "filename" : str,
            "height"   : 768,
            "is_video" : False,
            "postid"   : 3649262,
            "source"   : "danbooru",
            "sourceid" : 2434215,
            "tags"     : list,
            "type"     : "jpg",
            "url"      : str,
            "width"    : 1024,
        },
    })

    def __init__(self, match):
        """Store the post ID captured by 'pattern'"""
        super().__init__(match)
        # first capture group: the numeric ID from the post URL
        self.post_id = match.group(1)

    def posts(self):
        """Return a 1-tuple containing the requested post ID"""
        return (self.post_id,)


class NozomiTagExtractor(NozomiExtractor):
    """Extractor for posts from tag searches on nozomi.la"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{postid}"
    pattern = r"(?:https?://)?nozomi\.la/tag/([^/?&#]+)-\d+\."
    test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", {
        "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$",
        "count": ">= 75",
        "range": "1-75",
    })

    def __init__(self, match):
        """Store the URL-decoded, lowercased tag name from the URL"""
        super().__init__(match)
        tag = text.unquote(match.group(1))
        self.tags = tag.lower()

    def metadata(self):
        """Return the tag name as 'search_tags'"""
        return {"search_tags": self.tags}

    def posts(self):
        """Yield post IDs from the tag's '.nozomi' file, which is fetched
        in 256-byte slices by using HTTP Range requests
        """
        url = "https://n.nozomi.la/nozomi/{}.nozomi".format(self.tags)
        chunk_size = 256
        start = 0

        while True:
            end = start + chunk_size
            byte_range = "bytes={}-{}".format(start, end - 1)
            response = self.request(url, headers={"Range": byte_range})
            yield from self._unpack(response.content)

            # the part after "/" in Content-Range is the total size;
            # stop when it is missing, unparsable, or already reached
            content_range = response.headers.get("Content-Range", "")
            total = content_range.rpartition("/")[2]
            if text.parse_int(total, end) <= end:
                return
            start = end


class NozomiSearchExtractor(NozomiExtractor):
    """Extractor for search results on nozomi.la"""
    subcategory = "search"
    directory_fmt = ("{category}", "{search_tags:J }")
    archive_fmt = "t_{search_tags}_{postid}"
    pattern = r"(?:https?://)?nozomi\.la/search\.html\?q=([^&#]+)"
    test = ("https://nozomi.la/search.html?q=hibiscus%203:4_ratio#1", {
        "count": ">= 5",
    })

    def __init__(self, match):
        """Split the URL-decoded, lowercased query into a list of tags"""
        NozomiExtractor.__init__(self, match)
        self.tags = text.unquote(match.group(1)).lower().split()

    def metadata(self):
        """Return the list of search tags as 'search_tags'"""
        return {"search_tags": self.tags}

    def posts(self):
        """Return the set of post IDs that are listed under every tag

        The '.nozomi' file of each tag is downloaded and the post IDs
        they contain are intersected. An empty set is returned if there
        are no tags or no post is common to all of them.
        """
        # None means "no tag processed yet". An empty set must not be
        # used for this, since an empty intersection would otherwise be
        # refilled with the posts of the next tag.
        result = None

        for tag in self.tags:
            url = "https://j.nozomi.la/nozomi/{}.nozomi".format(tag)
            items = self._unpack(self.request(url).content)

            if result is None:
                result = set(items)
            else:
                result.intersection_update(items)

        return set() if result is None else result