gallery-dl/gallery_dl/extractor/mastodon.py

# -*- coding: utf-8 -*-

# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Mastodon instances"""

from .common import BaseExtractor, Message
from .. import text, exception
from ..cache import cache


class MastodonExtractor(BaseExtractor):
    """Base class for mastodon extractors"""
    basecategory = "mastodon"
    directory_fmt = ("mastodon", "{instance}", "{account[username]}")
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"

    def __init__(self, match):
        BaseExtractor.__init__(self, match)
        # the last matched group holds the URL-specific item
        # (user name, list ID, hashtag, status ID, ...)
        last_group = match.lastindex
        self.item = match.group(last_group)

    def _init(self):
        """Derive the instance name and read the filter options"""
        # everything after the URL scheme, e.g. "mastodon.social"
        _, _, self.instance = self.root.partition("://")
        self.reblogs = self.config("reblogs", False)
        self.replies = self.config("replies", True)
        self.cards = self.config("cards", False)

    def items(self):
        """Yield one Directory message per status and
        one Url message per attachment of that status
        """
        for status in self.statuses():

            # '_check_moved' sets itself to None when called,
            # so this runs for at most one status
            check_moved = self._check_moved
            if check_moved:
                check_moved(status["account"])

            if not self.reblogs and status["reblog"]:
                self.log.debug("Skipping %s (reblog)", status["id"])
                continue
            if not self.replies and status["in_reply_to_id"]:
                self.log.debug("Skipping %s (reply)", status["id"])
                continue

            # move the attachment list out of the metadata dict
            files = status.pop("media_attachments")

            reblog = status["reblog"]
            if reblog:
                files.extend(reblog["media_attachments"])

            # optionally treat a preview card's image as attachment
            card = status.get("card") if self.cards else None
            image = card.get("image") if card else None
            if image:
                card["weburl"] = card.get("url")
                card["url"] = image
                segments = image.split("/")[6:-2]
                card["id"] = "card" + "".join(segments).lstrip("0")
                files.append(card)

            status["instance"] = self.instance
            # remote accounts have an 'acct' value of "user@domain"
            _, separator, domain = status["account"]["acct"].rpartition("@")
            status["instance_remote"] = domain if separator else None

            status["count"] = len(files)
            status["tags"] = [tag["name"] for tag in status["tags"]]
            # only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") are parsed
            timestamp = status["created_at"][:19]
            status["date"] = text.parse_datetime(
                timestamp, "%Y-%m-%dT%H:%M:%S")

            yield Message.Directory, status
            for index, media in enumerate(files, 1):
                status["num"] = index
                status["media"] = media
                media_url = media["url"]
                yield Message.Url, media_url, text.nameext_from_url(
                    media_url, status)

    def statuses(self):
        """Return an iterable containing all relevant Status objects"""
        return ()

    def _check_moved(self, account):
        """Log a warning if 'account' has moved to another account"""
        # one-shot: shadow this method so it does not get called again
        self._check_moved = None
        # Certain fediverse software (such as Iceshrimp and Sharkey) sets
        # the account's "moved" field to null instead of omitting it.
        # To handle this, check whether the "moved" value is truthy
        # instead of only checking that the field exists.
        moved = account.get("moved")
        if moved:
            self.log.warning("Account '%s' moved to '%s'",
                             account["acct"], moved["acct"])


# Built-in Mastodon instances, keyed by instance name.
# Each entry provides the instance's root URL, a regex fragment matching
# its host name, and "access-token" / "client-id" / "client-secret" values.
# NOTE(review): 'update()' is defined outside this file; presumably it
# registers these instances and returns a regex prefix matching their
# root URLs -- confirm in BaseExtractor.
# The "(?:/web)?" suffix makes every pattern built on BASE_PATTERN accept
# an optional "/web" path segment directly after the host.
BASE_PATTERN = MastodonExtractor.update({
    "mastodon.social": {
        "root"         : "https://mastodon.social",
        "pattern"      : r"mastodon\.social",
        "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
        "client-id"    : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
        "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
    },
    "pawoo": {
        "root"         : "https://pawoo.net",
        "pattern"      : r"pawoo\.net",
        "access-token" : "c12c9d275050bce0dc92169a28db09d7"
                         "0d62d0a75a8525953098c167eacd3668",
        "client-id"    : "978a25f843ec01e53d09be2c290cd75c"
                         "782bc3b7fdbd7ea4164b9f3c3780c8ff",
        "client-secret": "9208e3d4a7997032cf4f1b0e12e5df38"
                         "8428ef1fadb446dcfeb4f5ed6872d97b",
    },
    "baraag": {
        "root"         : "https://baraag.net",
        "pattern"      : r"baraag\.net",
        "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
        "client-id"    : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
        "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
    }
}) + "(?:/web)?"


class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    subcategory = "user"
    # "/@USER" or "/users/USER", optionally followed by "/media"
    # and a trailing slash; nothing else may follow
    pattern = BASE_PATTERN + r"/(?:@|users/)([^/?#]+)(?:/media)?/?$"
    example = "https://mastodon.social/@USER"

    def statuses(self):
        """Return the statuses posted by the account named in the URL"""
        api = MastodonAPI(self)
        account_id = api.account_id_by_username(self.item)

        # Statuses without media attachments are only requested when
        # reblogs, cards, or text posts are enabled.
        include_non_media = (
            self.reblogs or
            self.cards or
            self.config("text-posts", False)
        )

        return api.account_statuses(
            account_id,
            only_media=not include_non_media,
            exclude_replies=not self.replies,
        )


class MastodonBookmarkExtractor(MastodonExtractor):
    """Extractor for mastodon bookmarks"""
    subcategory = "bookmark"
    pattern = BASE_PATTERN + r"/bookmarks"
    example = "https://mastodon.social/bookmarks"

    def statuses(self):
        """Return the statuses from the API's bookmarks endpoint"""
        api = MastodonAPI(self)
        return api.account_bookmarks()


class MastodonFavoriteExtractor(MastodonExtractor):
    """Extractor for mastodon favorites"""
    subcategory = "favorite"
    pattern = BASE_PATTERN + r"/favourites"
    example = "https://mastodon.social/favourites"

    def statuses(self):
        """Return the statuses from the API's favourites endpoint"""
        api = MastodonAPI(self)
        return api.account_favorites()


class MastodonListExtractor(MastodonExtractor):
    """Extractor for mastodon lists"""
    subcategory = "list"
    pattern = BASE_PATTERN + r"/lists/(\w+)"
    example = "https://mastodon.social/lists/12345"

    def statuses(self):
        """Return the statuses of the list timeline named in the URL"""
        list_id = self.item
        api = MastodonAPI(self)
        return api.timelines_list(list_id)


class MastodonHashtagExtractor(MastodonExtractor):
    """Extractor for mastodon hashtags"""
    subcategory = "hashtag"
    pattern = BASE_PATTERN + r"/tags/(\w+)"
    example = "https://mastodon.social/tags/NAME"

    def statuses(self):
        """Return the statuses of the hashtag timeline named in the URL"""
        hashtag = self.item
        api = MastodonAPI(self)
        return api.timelines_tag(hashtag)


class MastodonFollowingExtractor(MastodonExtractor):
    """Extractor for followed mastodon users"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/(?:@|users/)([^/?#]+)/following"
    example = "https://mastodon.social/@USER/following"

    def items(self):
        """Yield a Queue message for every account the user follows"""
        api = MastodonAPI(self)
        following = api.account_following(
            api.account_id_by_username(self.item))

        for followed in following:
            # have each queued account URL handled by the user extractor
            followed["_extractor"] = MastodonUserExtractor
            yield Message.Queue, followed["url"], followed


class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    subcategory = "status"
    # the negative lookahead keeps "/@USER/following" URLs
    # from being treated as a status
    pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)"
    example = "https://mastodon.social/@USER/12345"

    def statuses(self):
        """Return a 1-tuple holding the status named in the URL"""
        api = MastodonAPI(self)
        status = api.status(self.item)
        return (status,)


class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://docs.joinmastodon.org/
    https://github.com/tootsuite/mastodon

    All requests are sent through the 'request()' method of the
    extractor given to the constructor. Endpoints are relative to
    '<root>/api'; methods returning paginated results are generators.
    """

    def __init__(self, extractor):
        """Bind to 'extractor' and select the access token to send

        The token is taken from the first source that provides one:
        1. the extractor's "access-token" option
        2. the token cache entry for this instance
           (consulted when the option is unset or set to "cache")
        3. the result of extractor.config_instance("access-token")
        """
        self.root = extractor.root
        self.extractor = extractor

        access_token = extractor.config("access-token")
        if access_token is None or access_token == "cache":
            access_token = _access_token_cache(extractor.instance)
        if not access_token:
            access_token = extractor.config_instance("access-token")

        if access_token:
            self.headers = {"Authorization": "Bearer " + access_token}
        else:
            # no token available: send requests without Authorization
            self.headers = None

    def account_id_by_username(self, username):
        """Return the account ID belonging to 'username'

        A 'username' of the form "id:<ID>" yields '<ID>' without any
        API request. Otherwise the lookup endpoint is tried first and
        account search serves as fallback.

        Raises NotFoundError if the search finds no matching account.
        """
        if username.startswith("id:"):
            return username[3:]

        try:
            return self.account_lookup(username)["id"]
        except Exception as exc:
            # Deliberately broad: whatever goes wrong during lookup,
            # fall back to account search - but leave a trace of it.
            self.extractor.log.debug(
                "Account lookup for '%s' failed (%s: %s), "
                "falling back to account search",
                username, exc.__class__.__name__, exc)

        if "@" in username:
            handle = "@" + username
        else:
            handle = "@{}@{}".format(username, self.extractor.instance)

        for account in self.account_search(handle, 1):
            if account["acct"] == username:
                # '_check_moved' is set to None after its first use
                check_moved = self.extractor._check_moved
                if check_moved:
                    check_moved(account)
                return account["id"]
        raise exception.NotFoundError("account")

    def account_bookmarks(self):
        """Statuses the user has bookmarked"""
        endpoint = "/v1/bookmarks"
        return self._pagination(endpoint, None)

    def account_favorites(self):
        """Statuses the user has favourited"""
        endpoint = "/v1/favourites"
        return self._pagination(endpoint, None)

    def account_following(self, account_id):
        """Accounts which the given account is following"""
        endpoint = "/v1/accounts/{}/following".format(account_id)
        return self._pagination(endpoint, None)

    def account_lookup(self, username):
        """Look up a single account by its username ('acct' value)"""
        endpoint = "/v1/accounts/lookup"
        params = {"acct": username}
        return self._call(endpoint, params).json()

    def account_search(self, query, limit=40):
        """Search for matching accounts by username or display name"""
        endpoint = "/v1/accounts/search"
        params = {"q": query, "limit": limit}
        return self._call(endpoint, params).json()

    def account_statuses(self, account_id, only_media=True,
                         exclude_replies=False):
        """Statuses posted to the given account"""
        endpoint = "/v1/accounts/{}/statuses".format(account_id)
        # boolean flags are sent as the strings "true" / "false"
        params = {"only_media"     : "true" if only_media else "false",
                  "exclude_replies": "true" if exclude_replies else "false"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Obtain information about a status"""
        endpoint = "/v1/statuses/" + status_id
        return self._call(endpoint).json()

    def timelines_list(self, list_id):
        """View statuses in the given list timeline"""
        endpoint = "/v1/timelines/list/" + list_id
        return self._pagination(endpoint, None)

    def timelines_tag(self, hashtag):
        """View public statuses containing the given hashtag"""
        endpoint = "/v1/timelines/tag/" + hashtag
        return self._pagination(endpoint, None)

    def _call(self, endpoint, params=None):
        """Send an API request and return its response object

        'endpoint' is either a path below '<root>/api' or a complete
        URL starting with "http", as found in pagination links.

        Responses with a status code below 400 are returned as is.
        401 and all otherwise unhandled error codes raise
        StopExtraction, 404 raises NotFoundError, and 429 waits
        for the rate limit to reset before trying again.
        """
        if endpoint.startswith("http"):
            url = endpoint
        else:
            url = self.root + "/api" + endpoint

        while True:
            response = self.extractor.request(
                url, params=params, headers=self.headers, fatal=None)
            code = response.status_code

            if code < 400:
                return response
            if code == 401:
                raise exception.StopExtraction(
                    "Invalid or missing access token.\n"
                    "Run 'gallery-dl oauth:mastodon:%s' to obtain one.",
                    self.extractor.instance)
            if code == 404:
                raise exception.NotFoundError()
            if code == 429:
                # NOTE(review): assumes the server always sends an
                # 'x-ratelimit-reset' header in this timestamp format
                self.extractor.wait(until=text.parse_datetime(
                    response.headers["x-ratelimit-reset"],
                    "%Y-%m-%dT%H:%M:%S.%fZ",
                ))
                continue

            # Error bodies are not guaranteed to be JSON objects
            # (e.g. an HTML page from a proxy in front of the server);
            # do not let decoding problems mask the actual failure.
            try:
                message = response.json().get("error")
            except (ValueError, AttributeError):
                message = None
            raise exception.StopExtraction(
                message or "API request failed (HTTP {})".format(code))

    def _pagination(self, endpoint, params):
        """Yield the items of all result pages starting at 'endpoint'

        Follow-up pages are taken from the "next" link of each response.
        'params' are only sent with the first request.
        """
        url = endpoint
        while url:
            response = self._call(url, params)
            yield from response.json()

            next_link = response.links.get("next")
            url = next_link["url"] if next_link else None
            params = None


@cache(maxage=36500*86400, keyarg=0)
def _access_token_cache(instance):
    """Return the cached access token for 'instance'

    The function body itself always yields None ("no token"); any other
    result has to be supplied by the 'cache' decorator.
    NOTE(review): presumably tokens get stored into this cache elsewhere
    (e.g. by the 'oauth:mastodon:<instance>' command mentioned in the
    401 error message) -- confirm against the cache module.
    """
    # maxage is 36500 * 86400, i.e. 36500 days if the unit is seconds;
    # keyarg=0 presumably keys entries on 'instance' -- TODO confirm
    return None