2019-10-13 22:10:32 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-04-17 15:42:42 +02:00
|
|
|
# Copyright 2019-2023 Mike Fährmann
|
2019-10-13 22:10:32 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Extractors for https://nozomi.la/"""
|
|
|
|
|
|
|
|
from .common import Extractor, Message
|
|
|
|
from .. import text
|
|
|
|
|
|
|
|
|
2020-04-20 21:44:16 +02:00
|
|
|
def decode_nozomi(n):
    """Decode the binary content of a '.nozomi' file into integers.

    The data is read as a sequence of 4-byte big-endian unsigned integers,
    which are yielded one at a time in the order they appear.

    Args:
        n: bytes-like object (or sequence of byte values) to decode.

    Yields:
        One int per complete 4-byte word.

    Trailing bytes that do not form a complete word (for example from a
    truncated response) are ignored instead of raising an IndexError in
    the middle of iteration.
    """
    # 'len(n) - 3' stops the range at the start of the last complete word
    for i in range(0, len(n) - 3, 4):
        yield int.from_bytes(n[i:i+4], "big")
|
|
|
|
|
|
|
|
|
2019-10-13 22:10:32 +02:00
|
|
|
class NozomiExtractor(Extractor):
    """Base class for nozomi extractors"""
    category = "nozomi"
    root = "https://nozomi.la"
    filename_fmt = "{postid} {dataid}.{extension}"
    archive_fmt = "{dataid}"

    def items(self):
        """Yield a Directory message per post and a Url message per image.

        For every ID returned by posts(), the post's JSON document is
        requested from j.nozomi.la. Posts whose request comes back with a
        status code of 400 or above are logged and skipped.
        """
        extra = self.metadata()
        session_headers = self.session.headers
        session_headers["Origin"] = self.root
        session_headers["Referer"] = self.root + "/"

        # (metadata key to write, JSON key holding the raw tag objects)
        tag_fields = (
            ("tags", "general"),
            ("artist", "artist"),
            ("copyright", "copyright"),
            ("character", "character"),
        )

        for post_id in map(str, self.posts()):
            json_url = "https://j.nozomi.la/post/{}/{}/{}.json".format(
                post_id[-1], post_id[-3:-1], post_id)
            response = self.request(json_url, fatal=False)

            status = response.status_code
            if status >= 400:
                self.log.warning(
                    "Skipping post %s ('%s %s')",
                    post_id, status, response.reason)
                continue

            post = response.json()
            for key, source in tag_fields:
                post[key] = self._list(post.get(source))

            # a missing or unusable 'date' value results in None
            try:
                date = text.parse_datetime(
                    post["date"] + ":00", "%Y-%m-%d %H:%M:%S%z")
            except Exception:
                date = None
            post["date"] = date

            post.update(extra)

            # keep the image list, but remove raw fields from the metadata
            images = post["imageurls"]
            for key in ("general", "imageurl", "imageurls"):
                post.pop(key, None)

            yield Message.Directory, post

            for num, image in enumerate(images, 1):
                post["num"] = num

                dataid = image["dataid"]
                post["filename"] = dataid
                post["dataid"] = dataid

                is_video = bool(image.get("is_video"))
                post["is_video"] = is_video

                # videos and GIFs keep their own type on the 'v' and 'g'
                # subdomains; everything else is fetched as WebP from 'w'
                ext = image["type"]
                if is_video:
                    subdomain = "v"
                elif ext == "gif":
                    subdomain = "g"
                else:
                    subdomain, ext = "w", "webp"
                post["extension"] = ext

                file_url = "https://{}.nozomi.la/{}/{}/{}.{}".format(
                    subdomain, dataid[-1], dataid[-3:-1], dataid, ext)
                post["url"] = file_url
                yield Message.Url, file_url, post

    def posts(self):
        """Yield post IDs decoded from the '.nozomi' file at self.nozomi.

        The file is requested from n.nozomi.la in 256-byte slices through
        HTTP 'Range' headers, starting at the slice selected by self.pnum.
        Iteration ends once the next offset reaches the total size given
        after the '/' in the 'Content-Range' response header.
        """
        url = "https://n.nozomi.la" + self.nozomi
        start = (text.parse_int(self.pnum, 1) - 1) * 256

        while True:
            byte_range = "bytes={}-{}".format(start, start + 255)
            response = self.request(url, headers={"Range": byte_range})
            yield from decode_nozomi(response.content)

            start += 256
            content_range = response.headers.get("Content-Range", "")
            total = content_range.rpartition("/")[2]
            # 'start' doubles as the fallback value, presumably so that a
            # missing or unparsable total also ends the loop
            if text.parse_int(total, start) <= start:
                break

    def metadata(self):
        """Return additional metadata to merge into every post"""
        return {}

    @staticmethod
    def _list(src):
        """Return the 'tagname_display' values of all tag objects in 'src'

        An empty tuple is returned when 'src' is None or empty.
        """
        if not src:
            return ()
        return [tag["tagname_display"] for tag in src]
|
2019-10-13 22:10:32 +02:00
|
|
|
|
|
|
|
|
|
|
|
class NozomiPostExtractor(NozomiExtractor):
    """Extractor for individual posts on nozomi.la"""
    subcategory = "post"
    pattern = r"(?:https?://)?nozomi\.la/post/(\d+)"
    test = (
        ("https://nozomi.la/post/3649262.html", {
            "url": "e5525e717aec712843be8b88592d6406ae9e60ba",
            "pattern": r"https://w\.nozomi\.la/2/15/aaa9f7c632cde1e1a5baaff3fb"
                       r"6a6d857ec73df7fdc5cf5a358caf604bf73152\.webp",
            "content": "6d62c4a7fea50c0a89d499603c4e7a2b4b9bffa8",
            "keyword": {
                "artist": ["hammer (sunset beach)"],
                "character": ["patchouli knowledge"],
                "copyright": ["touhou"],
                "dataid": "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5",
                "date": "dt:2016-07-26 02:32:03",
                "extension": "webp",
                "filename": str,
                "height": 768,
                "is_video": False,
                "postid": 3649262,
                "tags": list,
                "type": "jpg",
                "url": str,
                "width": 1024,
            },
        }),
        # multiple images per post
        ("https://nozomi.la/post/25588032.html", {
            "url": "fb956ccedcf2cf509739d26e2609e910244aa56c",
            "keyword": "516ca5cbd0d2a46a8ce26679d6e08de5ac42184b",
            "count": 7,
        }),
        # empty 'date' (#1163)
        ("https://nozomi.la/post/130309.html", {
            "keyword": {"date": None},
        }),
        # gif
        ("https://nozomi.la/post/1647.html", {
            "pattern": r"https://g\.nozomi\.la/a/f0/d1b06469e00d72e4f6346209c1"
                       r"49db459d76b58a074416c260ed93cc31fa9f0a\.gif",
            "content": "952efb78252bbc9fb56df2e8fafb68d5e6364181",
        }),
        # video
        ("https://nozomi.la/post/2269847.html", {
            "pattern": r"https://v\.nozomi\.la/d/0e/ff88398862669783691b31519f"
                       r"2bea3a35c24b6e62e3ba2d89b4409e41c660ed\.webm",
            "content": "57065e6c16da7b1c7098a63b36fb0c6c6f1b9bca",
        }),
    )

    def __init__(self, match):
        """Store the numeric post ID captured by 'pattern'"""
        super().__init__(match)
        self.post_id = match.group(1)

    def posts(self):
        """Return a 1-tuple containing only the requested post ID"""
        return (self.post_id,)
|
|
|
|
|
|
|
|
|
2021-03-11 01:06:47 +01:00
|
|
|
class NozomiIndexExtractor(NozomiExtractor):
    """Extractor for the nozomi.la index"""
    subcategory = "index"
    pattern = (r"(?:https?://)?nozomi\.la/"
               r"(?:(index(?:-Popular)?)-(\d+)\.html)?(?:$|#|\?)")
    test = (
        "https://nozomi.la/",
        "https://nozomi.la/index-2.html",
        "https://nozomi.la/index-Popular-33.html",
    )

    def __init__(self, match):
        """Derive the '.nozomi' path and page number from the matched URL"""
        super().__init__(match)
        name, page = match.groups()
        self.pnum = page
        # the bare root URL captures no index name; fall back to "index"
        self.nozomi = "/{}.nozomi".format(name or "index")
|
|
|
|
|
|
|
|
|
2019-10-13 22:10:32 +02:00
|
|
|
class NozomiTagExtractor(NozomiExtractor):
    """Extractor for posts from tag searches on nozomi.la"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{dataid}"
    pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\."
    test = (
        "https://nozomi.la/tag/3:1_aspect_ratio-1.html",
        {
            "pattern": r"^https://[wgv]\.nozomi\.la/\w/\w\w/\w+\.\w+$",
            "count": ">= 25",
            "range": "1-25",
        },
    )

    def __init__(self, match):
        """Store the unquoted tag, page number, and '.nozomi' path"""
        super().__init__(match)
        quoted, page = match.groups()
        self.pnum = page
        self.tags = text.unquote(quoted)
        self.nozomi = "/nozomi/{}.nozomi".format(self.tags)

    def metadata(self):
        """Return the searched tag as 'search_tags'"""
        return {"search_tags": self.tags}
|
|
|
|
|
2019-10-14 23:49:46 +02:00
|
|
|
|
|
|
|
class NozomiSearchExtractor(NozomiExtractor):
    """Extractor for search results on nozomi.la"""
    subcategory = "search"
    directory_fmt = ("{category}", "{search_tags:J }")
    archive_fmt = "t_{search_tags}_{dataid}"
    pattern = r"(?:https?://)?nozomi\.la/search\.html\?q=([^&#]+)"
    test = (
        "https://nozomi.la/search.html?q=hibiscus%203:4_ratio#1",
        {
            "count": ">= 5",
        },
    )

    def __init__(self, match):
        """Split the unquoted query string into a list of search tags"""
        super().__init__(match)
        query = text.unquote(match.group(1))
        self.tags = query.split()

    def metadata(self):
        """Return the list of search tags as 'search_tags'"""
        return {"search_tags": self.tags}

    def posts(self):
        """Return the IDs matching all search tags, highest ID first.

        IDs must be listed for every regular tag and must not be listed
        for any tag prefixed with '-'. When there are no regular tags,
        the IDs of the general index are used as the starting set.
        An empty tuple is returned when nothing matches.
        """
        def fetch_ids(path):
            url = "https://j.nozomi.la/" + path + ".nozomi"
            return decode_nozomi(self.request(url).content)

        # partition tags into required and excluded ones;
        # slashes are removed before a tag is used in a request path
        required = []
        excluded = []
        for tag in self.tags:
            if tag[0] == "-":
                excluded.append(tag.replace("/", ""))
            else:
                required.append(tag.replace("/", ""))

        matches = None
        for index, tag in enumerate(required):
            ids = fetch_ids("nozomi/" + tag)
            if index:
                matches.intersection_update(ids)
            else:
                matches = set(ids)

        if matches is None:
            matches = set(fetch_ids("index"))

        for tag in excluded:
            # strip the leading '-' marker
            matches.difference_update(fetch_ids("nozomi/" + tag[1:]))

        if not matches:
            return ()
        return sorted(matches, reverse=True)
|