2021-10-26 20:00:41 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for https://skeb.jp/"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text
|
2022-03-28 10:29:24 +02:00
|
|
|
import itertools
|
2021-10-26 20:00:41 +02:00
|
|
|
|
|
|
|
|
|
|
|
class SkebExtractor(Extractor):
    """Common functionality shared by all skeb.jp extractors"""
    category = "skeb"
    directory_fmt = ("{category}", "{creator[screen_name]}")
    filename_fmt = "{post_num}_{file_id}.{extension}"
    archive_fmt = "{post_num}_{_file_id}_{content_category}"
    root = "https://skeb.jp"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # First capture group of the matched URL pattern
        self.user_name = match.group(1)
        # Optional extra downloads, both disabled by default
        self.thumbnails = self.config("thumbnails", False)
        self.article = self.config("article", False)

    def items(self):
        """Yield a Directory message per post and a Url message per file"""
        extra = self.metadata()

        for user_name, post_num in self.posts():
            resp, post = self._get_post_data(user_name, post_num)
            if extra:
                post.update(extra)
            yield Message.Directory, post

            for file_data in self._get_urls_from_post(resp, post):
                file_url = file_data["file_url"]
                yield (Message.Url, file_url,
                       text.nameext_from_url(file_url, file_data))

    def posts(self):
        """Return an iterable of (user_name, post_num) pairs"""

    def metadata(self):
        """Return a dict of extra metadata merged into every post"""

    def _pagination(self, url, params):
        """Yield (user_name, post_num) for each non-private listed work

        Requests 'url' repeatedly while advancing params["offset"] and
        stops as soon as a page contains fewer than 30 entries.
        """
        per_page = 30
        params["offset"] = 0
        headers = {"Referer": self.root, "Authorization": "Bearer null"}

        while True:
            page = self.request(url, params=params, headers=headers).json()

            for entry in page:
                # 'path' has the form "/@<user_name>/works/<post_num>"
                segments = entry["path"].split("/")
                user_name, post_num = segments[1][1:], segments[3]

                if not entry["private"]:
                    yield user_name, post_num
                else:
                    self.log.debug("Skipping @%s/%s (private)",
                                   user_name, post_num)

            if len(page) < per_page:
                return
            params["offset"] += per_page

    def _get_post_data(self, user_name, post_num):
        """Fetch a single work; return (raw API response, post metadata)"""
        user_fields = ("id", "name", "screen_name",
                       "avatar_url", "header_url")

        endpoint = "{}/api/users/{}/works/{}".format(
            self.root, user_name, post_num)
        work = self.request(endpoint, headers={
            "Referer": self.root, "Authorization": "Bearer null",
        }).json()

        creator = work["creator"]
        post = {
            "post_id": work["id"],
            "post_num": post_num,
            "post_url": self.root + work["path"],
            "body": work["body"],
            "source_body": work["source_body"],
            "translated_body": work["translated"],
            "completed_at": work["completed_at"],
            "date": text.parse_datetime(
                work["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ"),
            "nsfw": work["nsfw"],
            "anonymous": work["anonymous"],
            "tags": work["tag_list"],
            "genre": work["genre"],
            "thanks": work["thanks"],
            "source_thanks": work["source_thanks"],
            "translated_thanks": work["translated_thanks"],
            "creator": {key: creator[key] for key in user_fields},
        }

        # Client details are only attached for non-anonymous requests
        if "client" in work and not work["anonymous"]:
            client = work["client"]
            post["client"] = {key: client[key] for key in user_fields}

        return work, post

    def _get_urls_from_post(self, resp, post):
        """Yield 'post' once per downloadable file of the given response

        The same dict object is updated in place and re-yielded for every
        file, so consumers must process it before requesting the next one.
        """
        if self.thumbnails and "og_image_url" in resp:
            post.update({
                "content_category": "thumb",
                "file_id": "thumb",
                "_file_id": str(resp["id"]) + "t",
                "file_url": resp["og_image_url"],
            })
            yield post

        article_url = resp.get("article_image_url") if self.article else None
        if article_url:
            post.update({
                "content_category": "article",
                "file_id": "article",
                "_file_id": str(resp["id"]) + "a",
                "file_url": article_url,
            })
            yield post

        info_fields = ("width", "height", "byte_size", "duration",
                       "frame_rate", "software", "extension",
                       "is_movie", "transcoder")

        for preview in resp["previews"]:
            file_id = preview["id"]
            post["content_category"] = "preview"
            post["file_id"] = file_id
            post["_file_id"] = file_id
            post["file_url"] = preview["url"]

            details = preview["information"]
            post["original"] = {name: details[name] for name in info_fields}
            yield post
|
|
|
|
|
|
|
|
|
|
|
|
class SkebPostExtractor(SkebExtractor):
    """Extractor for one individual work posted on skeb"""
    subcategory = "post"
    pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/works/(\d+)"
    test = ("https://skeb.jp/@kanade_cocotte/works/38", {
        "count": 2,
        "keyword": {
            "anonymous": False,
            "body": "re:はじめまして。私はYouTubeにてVTuberとして活動をしている湊ラ",
            "client": {
                "avatar_url": r"re:https://pbs.twimg.com/profile_images"
                              r"/\d+/\w+\.jpg",
                "header_url": r"re:https://pbs.twimg.com/profile_banners"
                              r"/1375007870291300358/\d+/1500x500",
                "id": 1196514,
                "name": str,
                "screen_name": "minato_ragi",
            },
            "completed_at": "2022-02-27T14:03:45.442Z",
            "content_category": "preview",
            "creator": {
                "avatar_url": "https://pbs.twimg.com/profile_images"
                              "/1225470417063645184/P8_SiB0V.jpg",
                "header_url": "https://pbs.twimg.com/profile_banners"
                              "/71243217/1647958329/1500x500",
                "id": 159273,
                "name": "イチノセ奏",
                "screen_name": "kanade_cocotte",
            },
            "date": "dt:2022-02-27 14:03:45",
            "file_id": int,
            "file_url": str,
            "genre": "art",
            "nsfw": False,
            "original": {
                "byte_size": int,
                "duration": None,
                "extension": "re:psd|png",
                "frame_rate": None,
                "height": 3727,
                "is_movie": False,
                "width": 2810,
            },
            "post_num": "38",
            "post_url": "https://skeb.jp/@kanade_cocotte/works/38",
            "source_body": None,
            "source_thanks": None,
            "tags": list,
            "thanks": None,
            "translated_body": False,
            "translated_thanks": None,
        }
    })

    def __init__(self, match):
        SkebExtractor.__init__(self, match)
        # Second capture group of 'pattern': the numeric work ID
        self.post_num = match.group(2)

    def posts(self):
        """Return a 1-tuple holding this post's (user_name, post_num)"""
        target = (self.user_name, self.post_num)
        return (target,)
|
2021-10-26 20:00:41 +02:00
|
|
|
|
|
|
|
|
|
|
|
class SkebUserExtractor(SkebExtractor):
    """Extractor for every work listed on a skeb user's profile"""
    subcategory = "user"
    pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/?$"
    test = ("https://skeb.jp/@kanade_cocotte", {
        "pattern": r"https://skeb\.imgix\.net/uploads/origins/[\w-]+"
                   r"\?bg=%23fff&auto=format&txtfont=bold&txtshad=70"
                   r"&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150"
                   r"&txt=SAMPLE&w=800&s=\w+",
        "range": "1-5",
    })

    def posts(self):
        """Return works created by the user, optionally followed by the
        works the user requested as a client ('sent-requests' option)
        """
        works_url = "{}/api/users/{}/works".format(self.root, self.user_name)

        # Each listing gets its own params dict: _pagination() mutates
        # "offset" in place and both generators are consumed lazily.
        created = self._pagination(
            works_url, {"role": "creator", "sort": "date"})

        if not self.config("sent-requests", False):
            return created

        requested = self._pagination(
            works_url, {"role": "client", "sort": "date"})
        return itertools.chain(created, requested)
|
2022-06-27 11:31:49 +02:00
|
|
|
|
|
|
|
|
2022-09-21 17:57:55 +02:00
|
|
|
class SkebSearchExtractor(SkebExtractor):
    """Extractor for skeb search results"""
    subcategory = "search"
    pattern = r"(?:https?://)?skeb\.jp/search\?q=([^&#]+)"
    test = ("https://skeb.jp/search?q=bunny%20tree&t=works", {
        "count": ">= 18",
        "keyword": {"search_tags": "bunny tree"},
    })

    def metadata(self):
        """Return the decoded search query as 'search_tags'"""
        # For this extractor, 'user_name' holds the URL-encoded value of
        # the 'q' parameter (first capture group of 'pattern').
        return {"search_tags": text.unquote(self.user_name)}

    def posts(self):
        """Yield (user_name, post_num) for every hit of the search query

        Results are fetched page by page from the Algolia index used by
        the site. The 'filters' option may be a string or an iterable of
        strings, which get joined with " OR "; when unset, all genres are
        searched.
        """
        url = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries"
        params = {
            "x-algolia-agent": "Algolia for JavaScript (4.13.1); Browser",
        }
        headers = {
            "Origin": self.root,
            "Referer": self.root + "/",
            "x-algolia-api-key": "9a4ce7d609e71bf29e977925e4c6740c",
            "x-algolia-application-id": "HB1JT3KRE9",
        }

        filters = self.config("filters")
        if filters is None:
            filters = ("genre:art OR genre:voice OR genre:novel OR "
                       "genre:video OR genre:music OR genre:correction")
        elif not isinstance(filters, str):
            filters = " OR ".join(filters)

        page = 0
        pams = "hitsPerPage=40&filters=" + text.quote(filters) + "&page="

        request = {
            "indexName": "Request",
            "query": text.unquote(self.user_name),
            "params": pams + str(page),
        }
        # 'data' references 'request', so updating request["params"]
        # below changes the payload of the next POST.
        data = {"requests": (request,)}

        while True:
            result = self.request(
                url, method="POST", params=params, headers=headers, json=data,
            ).json()["results"][0]

            for post in result["hits"]:
                # 'path' has the form "/@<user_name>/works/<post_num>"
                parts = post["path"].split("/")
                yield parts[1][1:], parts[3]

            # Pages are numbered from zero, so the last valid index is
            # nbPages - 1. Advance first and compare afterwards to avoid
            # requesting a page index that does not exist.
            page += 1
            if page >= result["nbPages"]:
                return
            request["params"] = pams + str(page)
|
|
|
|
|
|
|
|
|
2022-06-27 11:31:49 +02:00
|
|
|
class SkebFollowingExtractor(SkebExtractor):
    """Extractor that queues every creator a skeb user follows"""
    subcategory = "following"
    pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators"
    test = ("https://skeb.jp/@user/following_creators",)

    def items(self):
        """Yield one Queue message per followed creator's profile URL"""
        for creator in self.users():
            profile_url = "{}/@{}".format(self.root, creator["screen_name"])
            # Have the queued URL handled by the user extractor
            creator["_extractor"] = SkebUserExtractor
            yield Message.Queue, profile_url, creator

    def users(self):
        """Yield all entries of the 'following_creators' API listing"""
        limit = 90
        endpoint = "{}/api/users/{}/following_creators".format(
            self.root, self.user_name)
        headers = {"Referer": self.root, "Authorization": "Bearer null"}
        params = {"sort": "date", "offset": 0, "limit": limit}

        while True:
            batch = self.request(
                endpoint, params=params, headers=headers).json()
            for creator in batch:
                yield creator

            # A page shorter than the requested limit ends the listing
            if len(batch) < limit:
                return
            params["offset"] += limit
|