gallery-dl/gallery_dl/extractor/itaku.py

# -*- coding: utf-8 -*-

# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://itaku.ee/"""

from .common import Extractor, Message
from ..cache import memcache
from .. import text

# Regex prefix shared by every extractor pattern in this module:
# the itaku.ee host with an optional http(s) scheme in front of it.
BASE_PATTERN = r"(?:https?://)?itaku\.ee"


class ItakuExtractor(Extractor):
    """Common functionality shared by all itaku extractors

    Subclasses provide a posts() method returning an iterable of post
    objects; items() normalizes their metadata and emits the messages.
    """
    category = "itaku"
    root = "https://itaku.ee"
    directory_fmt = ("{category}", "{owner_username}")
    filename_fmt = ("{id}{title:? //}.{extension}")
    archive_fmt = "{id}"
    request_interval = (0.5, 1.5)

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first capture group of the subclass pattern
        self.item = match.group(1)

    def _init(self):
        """Create the API helper and read the 'videos' option"""
        self.api = ItakuAPI(self)
        self.videos = self.config("videos", True)

    def items(self):
        """Yield one Directory and one Url message per post"""
        date_format = "%Y-%m-%dT%H:%M:%S.%fZ"

        for post in self.posts():
            post["date"] = text.parse_datetime(
                post["date_added"], date_format)

            # replace tag objects by plain lists of tag names;
            # each tag category gets its own 'tags_<category>' field
            categorized = post.pop("categorized_tags")
            for category, tags in categorized.items():
                key = "tags_" + category.lower()
                post[key] = [tag["name"] for tag in tags]
            post["tags"] = [tag["name"] for tag in post["tags"]]

            # sections inside a group are stored as '<group>/<section>'
            post["sections"] = [
                section["group"]["title"] + "/" + section["title"]
                if section["group"] else section["title"]
                for section in post["sections"]
            ]

            # use the video file only if there is one and videos are enabled
            video = post["video"]
            url = video["video"] if video and self.videos else post["image"]

            yield Message.Directory, post
            yield Message.Url, url, text.nameext_from_url(url, post)


class ItakuGalleryExtractor(ItakuExtractor):
    """Extractor for all images in the gallery of an itaku user profile"""
    subcategory = "gallery"
    pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery"
    example = "https://itaku.ee/profile/USER/gallery"

    def posts(self):
        """Return the gallery images of the user named in the URL"""
        username = self.item
        return self.api.galleries_images(username)


class ItakuImageExtractor(ItakuExtractor):
    """Extractor for a single itaku image selected by its numeric ID"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"/images/(\d+)"
    example = "https://itaku.ee/images/12345"

    def posts(self):
        """Return a 1-tuple holding the image object for the URL's ID"""
        image = self.api.image(self.item)
        return (image,)


class ItakuAPI:
    """Helper for the JSON API located at '<extractor.root>/api'

    All HTTP requests are sent through the owning extractor's request()
    method.
    """

    def __init__(self, extractor):
        self.extractor = extractor
        self.root = extractor.root + "/api"
        self.headers = {"Accept": "application/json, text/plain, */*"}

    def galleries_images(self, username, section=None):
        """Return a generator of image objects from a user's gallery

        The user profile is looked up right away to get its 'owner'
        value; the listing itself is requested once iteration starts.
        """
        owner = self.user(username)["owner"]
        # key order is kept as-is so the generated query stays the same
        params = {
            "cursor"    : None,
            "owner"     : owner,
            "section"   : section,
            "date_range": "",
            "maturity_rating": ("SFW", "Questionable", "NSFW"),
            "ordering"  : "-date_added",
            "page"      : "1",
            "page_size" : "30",
            "visibility": ("PUBLIC", "PROFILE_ONLY"),
        }
        return self._pagination("/galleries/images/", params, self.image)

    def image(self, image_id):
        """Return the decoded JSON object for a single image"""
        return self._call("/galleries/images/{}/".format(image_id))

    @memcache(keyarg=1)
    def user(self, username):
        """Return the decoded JSON profile of 'username'"""
        endpoint = "/user_profiles/{}/".format(username)
        return self._call(endpoint)

    def _call(self, endpoint, params=None):
        """Request 'endpoint' and return its decoded JSON response

        'endpoint' is either a path below the API root or a full URL
        (anything that starts with "http" is used unchanged).
        """
        if endpoint.startswith("http"):
            url = endpoint
        else:
            url = self.root + endpoint
        return self.extractor.request(
            url, params=params, headers=self.headers).json()

    def _pagination(self, endpoint, params, extend):
        """Yield the results of all pages of a listing endpoint

        If 'extend' is truthy, it is called with the 'id' of every
        result and its return value is yielded in place of the result.
        """
        page = self._call(endpoint, params)

        while True:
            results = page["results"]
            if extend:
                for entry in results:
                    yield extend(entry["id"])
            else:
                yield from results

            # stop when the response has no link to a following page
            next_url = page["links"].get("next")
            if not next_url:
                break

            page = self._call(next_url)
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`# -*- coding: utf-8 -*-`

remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`# Copyright 2022-2023 Mike Fährmann`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://itaku.ee/"""`

			`from .common import Extractor, Message`
			`from ..cache import memcache`
			`from .. import text`

			`BASE_PATTERN = r"(?:https?://)?itaku\.ee"`


			`class ItakuExtractor(Extractor):`
			`"""Base class for itaku extractors"""`
			`category = "itaku"`
			`root = "https://itaku.ee"`
			`directory_fmt = ("{category}", "{owner_username}")`
[itaku] add 'title' to default filenames (#1842) 2022-06-20 19:35:46 +02:00			`filename_fmt = ("{id}{title:? //}.{extension}")`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`archive_fmt = "{id}"`
			`request_interval = (0.5, 1.5)`

			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.item = match.group(1)`
decouple extractor initialization Introduce an 'initialize()' function that does the actual init (session, cookies, config options) and can called separately from the constructor __init__(). This allows, for example, to adjust config access inside a Job before most of it already happened when calling 'extractor.find()'. 2023-07-25 20:09:44 +02:00
			`def _init(self):`
			`self.api = ItakuAPI(self)`
[itaku] support videos (#1842) 2022-06-20 19:47:53 +02:00			`self.videos = self.config("videos", True)`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00
			`def items(self):`
			`for post in self.posts():`
[itaku] metadata cleanup (#1842) - parse 'date_added' as 'date' - simplify 'tags', 'categorized_tags', and 'sections' 2022-06-20 19:28:15 +02:00
			`post["date"] = text.parse_datetime(`
[itaku] fix 'date' parsing 2022-07-10 20:45:51 +02:00			`post["date_added"], "%Y-%m-%dT%H:%M:%S.%fZ")`
[itaku] metadata cleanup (#1842) - parse 'date_added' as 'date' - simplify 'tags', 'categorized_tags', and 'sections' 2022-06-20 19:28:15 +02:00			`for category, tags in post.pop("categorized_tags").items():`
			`post["tags_" + category.lower()] = [t["name"] for t in tags]`
			`post["tags"] = [t["name"] for t in post["tags"]]`
[itaku] categorize sections by group (#1842) 2022-06-29 22:34:07 +02:00
			`sections = []`
			`for s in post["sections"]:`
			`group = s["group"]`
			`if group:`
			`sections.append(group["title"] + "/" + s["title"])`
			`else:`
			`sections.append(s["title"])`
			`post["sections"] = sections`
[itaku] metadata cleanup (#1842) - parse 'date_added' as 'date' - simplify 'tags', 'categorized_tags', and 'sections' 2022-06-20 19:28:15 +02:00
[itaku] support videos (#1842) 2022-06-20 19:47:53 +02:00			`if post["video"] and self.videos:`
			`url = post["video"]["video"]`
			`else:`
			`url = post["image"]`

[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`yield Message.Directory, post`
			`yield Message.Url, url, text.nameext_from_url(url, post)`


			`class ItakuGalleryExtractor(ItakuExtractor):`
			`"""Extractor for posts from an itaku user gallery"""`
			`subcategory = "gallery"`
			`pattern = BASE_PATTERN + r"/profile/([^/?#]+)/gallery"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://itaku.ee/profile/USER/gallery"`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00
			`def posts(self):`
			`return self.api.galleries_images(self.item)`


			`class ItakuImageExtractor(ItakuExtractor):`
			`subcategory = "image"`
			`pattern = BASE_PATTERN + r"/images/(\d+)"`
remove test results in extractor modules and add generic example URLs 2023-09-11 16:30:55 +02:00			`example = "https://itaku.ee/images/12345"`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00
			`def posts(self):`
			`return (self.api.image(self.item),)`


			`class ItakuAPI():`

			`def __init__(self, extractor):`
			`self.extractor = extractor`
			`self.root = extractor.root + "/api"`
			`self.headers = {`
			`"Accept": "application/json, text/plain, */*",`
			`}`

			`def galleries_images(self, username, section=None):`
			`endpoint = "/galleries/images/"`
			`params = {`
			`"cursor" : None,`
			`"owner" : self.user(username)["owner"],`
			`"section" : section,`
			`"date_range": "",`
[itaku] remove 'Extreme' rating (#3287) 2022-11-24 11:09:00 +01:00			`"maturity_rating": ("SFW", "Questionable", "NSFW"),`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`"ordering" : "-date_added",`
			`"page" : "1",`
			`"page_size" : "30",`
			`"visibility": ("PUBLIC", "PROFILE_ONLY"),`
			`}`
			`return self._pagination(endpoint, params, self.image)`

			`def image(self, image_id):`
[itaku] categorize sections by group (#1842) 2022-06-29 22:34:07 +02:00			`endpoint = "/galleries/images/{}/".format(image_id)`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`return self._call(endpoint)`

[itaku] fix caching bug (#1842) ItakuApi.user() would always return the first user it was called with, regardless of its 'username' argument. 2022-07-01 20:50:38 +02:00			`@memcache(keyarg=1)`
[itaku] add 'gallery' and 'image' extractors (#1842) 2022-06-15 22:51:40 +02:00			`def user(self, username):`
			`return self._call("/user_profiles/{}/".format(username))`

			`def _call(self, endpoint, params=None):`
			`if not endpoint.startswith("http"):`
			`endpoint = self.root + endpoint`
			`response = self.extractor.request(`
			`endpoint, params=params, headers=self.headers)`
			`return response.json()`

			`def _pagination(self, endpoint, params, extend):`
			`data = self._call(endpoint, params)`

			`while True:`
			`if extend:`
			`for result in data["results"]:`
			`yield extend(result["id"])`
			`else:`
			`yield from data["results"]`

			`url_next = data["links"].get("next")`
			`if not url_next:`
			`return`

			`data = self._call(url_next)`