mirror of
https://github.com/mikf/gallery-dl.git
synced 2024-11-25 04:02:32 +01:00
[naverpost] add 'post' and 'user' extractors
This commit is contained in:
parent
4288cea94a
commit
757df3a845
@ -559,6 +559,12 @@ Consider all sites to be NSFW unless otherwise known.
|
||||
<td>Blogs, Posts</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>NaverPost</td>
|
||||
<td>https://post.naver.com/</td>
|
||||
<td>Posts, User Profiles</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>NaverWebtoon</td>
|
||||
<td>https://comic.naver.com/</td>
|
||||
|
@ -100,6 +100,7 @@ modules = [
|
||||
"myhentaigallery",
|
||||
"myportfolio",
|
||||
"naver",
|
||||
"naverpost",
|
||||
"naverwebtoon",
|
||||
"newgrounds",
|
||||
"nhentai",
|
||||
|
143
gallery_dl/extractor/naverpost.py
Normal file
143
gallery_dl/extractor/naverpost.py
Normal file
@ -0,0 +1,143 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for https://post.naver.com/"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .. import text, exception
|
||||
import json
|
||||
import re
|
||||
|
||||
# Common URL prefix for all extractor patterns in this module:
# optional http(s) scheme, optional "m." subdomain, then post.naver.com
BASE_PATTERN = r"(?:https?://)?(?:m\.)?post\.naver\.com"
|
||||
|
||||
|
||||
class NaverpostExtractor(Extractor):
    """Base class for naver post extractors"""
    category = "naverpost"
    root = "https://post.naver.com"
    request_interval = (0.5, 1.5)

    def _call(self, url, params=None):
        """Request 'url' and return the response object

        HTTP errors with status 401, 403, and 404 are re-raised as
        AuthenticationError, AuthorizationError, and NotFoundError.
        Any other HttpError is logged at debug level
        and None is returned instead of a response.
        """
        if params is None:
            params = {}
        # a single attempt: every path below either returns or raises
        try:
            return self.request(url, params=params)
        except exception.HttpError as exc:
            if exc.status == 401:
                raise exception.AuthenticationError()
            if exc.status == 403:
                raise exception.AuthorizationError()
            if exc.status == 404:
                raise exception.NotFoundError(self.subcategory)
            self.log.debug(exc)
            return None

    def _pagination(self, url, params=None):
        """Yield the absolute URL of every post listed by 'url'

        Requests are repeated with an updated 'fromNo' parameter until a
        response has no 'nextFromNo' entry, or until _call() returns None.
        """
        # work on a copy so the caller's dict is never modified
        params = dict(params) if params else {}
        while True:
            response = self._call(url, params)
            if response is None:
                # the HTTP error has already been logged by _call()
                return
            # the `html` string in the response contains escaped single
            # quotes, which would throw a JSONDecodeError exception
            res = json.loads(response.text.replace(r"\'", "'"))
            for endpoint in text.extract_iter(
                    res["html"], '<div class="text_area">\n<a href="', '"'):
                yield self.root + endpoint
            if "nextFromNo" not in res:
                return
            params["fromNo"] = res["nextFromNo"]
|
||||
|
||||
|
||||
class NaverpostPostExtractor(NaverpostExtractor):
    """Extractor for posts on post.naver.com"""
    subcategory = "post"
    filename_fmt = "{image[id]}.{extension}"
    directory_fmt = ("{category}", "{author}", "{volume_no}")
    archive_fmt = "{image[id]}"
    pattern = (BASE_PATTERN + r"/viewer/postView\.(naver|nhn)"
               r"\?volumeNo=(\d+)(?:&.+)?")
    example = "https://post.naver.com/viewer/postView.naver?volumeNo=12345"

    def __init__(self, match):
        NaverpostExtractor.__init__(self, match)
        url = match.group(0)
        # the pattern accepts URLs without a scheme,
        # but the page request needs an absolute URL
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        self.url = url
        self.page_ext = match.group(1)
        self.volume_no = match.group(2)

    def metadata(self, page):
        """Collect post metadata from the HTML source in 'page'

        Returns a dict with the keys 'title', 'description', 'author',
        'date', 'volume_no', 'views', and 'url'.
        """
        # the view counter is looked up in 'post_view' first,
        # then in 'se_view' when the first lookup yields nothing
        views = (
            text.extr(page, '<span class="post_view">', ' ') or
            text.extr(page, '<span class="se_view" style="">', ' ')
        )
        return {
            "title": text.unescape(
                text.extr(page, '"og:title" content="', '"')),
            "description": text.unescape(
                text.extr(page, '"og:description" content="', '"')),
            "author": text.extr(page, '"og:author" content="', '"'),
            "date": text.parse_datetime(
                text.extr(page, '"og:createdate" content="', '"'),
                format="%Y.%m.%d. %H:%M:%S", utcoffset=9),
            "volume_no": self.volume_no,
            "views": text.parse_int(views.replace(",", "")),
            "url": self.url,
        }

    def items(self):
        """Yield a Directory message, then one Url message per image

        Only <img> tags with one of the known image classes and a
        'data-src' URL ending in '?type=w<digits>' are yielded;
        that query string is removed from the yielded URL.
        """
        response = self._call(self.url)
        if response is None:
            # _call() logged the HTTP error and returned no response
            return
        page = response.text
        data = self.metadata(page)

        yield Message.Directory, data

        image_classes = ("img_attachedfile", "se_mediaImage")
        # compiled once; subn() both tests for and strips the query string
        strip_query = re.compile(r"\?type=w\d+$").subn
        for image in text.extract_iter(page, "<img", ">"):
            classes = text.extr(image, ' class="', '"').split()
            if not any(item in classes for item in image_classes):
                continue
            url, count = strip_query(
                "", text.extr(image, ' data-src="', '"'))
            if not count:
                continue

            img = {
                "id": text.extr(image, ' id="', '"'),
                "title": text.extr(image, ' title="', '"'),
                "attachment-id": text.extr(
                    image, ' data-attachment-id="', '"'),
                "alt": None,
                "url": url,
            }
            alt = text.extr(image, ' alt="', '"')
            if alt and alt.endswith(".jpg"):
                # the alt text is used as file name when it ends in '.jpg'
                img["alt"] = alt
                data["filename"], _, data["extension"] = alt.rpartition(".")
            else:
                text.nameext_from_url(text.unquote(url), data)
            data["image"] = img
            yield Message.Url, url, data
|
||||
|
||||
|
||||
class NaverpostUserExtractor(NaverpostExtractor):
    """Extractor queueing every post of a post.naver.com member"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/my\.naver\?memberNo=(\d+)"
    example = "https://post.naver.com/my.naver?memberNo=12345"

    def __init__(self, match):
        NaverpostExtractor.__init__(self, match)
        self.member_no = match.group(1)

    def items(self):
        """Yield one Queue message per post URL found by _pagination()"""
        api_url = self.root + "/async/my.naver"
        query = {"memberNo": self.member_no}
        # each queued URL is tagged with the extractor class for posts
        queue_data = {"_extractor": NaverpostPostExtractor}
        for post_url in self._pagination(api_url, query):
            yield Message.Queue, post_url, queue_data
|
@ -89,6 +89,7 @@ CATEGORY_MAP = {
|
||||
"mastodon.social": "mastodon.social",
|
||||
"myhentaigallery": "My Hentai Gallery",
|
||||
"myportfolio" : "Adobe Portfolio",
|
||||
"naverpost" : "NaverPost",
|
||||
"naverwebtoon" : "NaverWebtoon",
|
||||
"nhentai" : "nhentai",
|
||||
"nijie" : "nijie",
|
||||
|
53
test/results/naverpost.py
Normal file
53
test/results/naverpost.py
Normal file
@ -0,0 +1,53 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
from gallery_dl.extractor import naverpost
|
||||
|
||||
# Case-insensitive pattern for image URLs on post-phinf.pstatic.net
# containing a gif, jpg/jpeg, png, or webp file extension
IMAGE_URL_PATTERN = r"(?i)https://post-phinf\.pstatic\.net/.*\.(?:gif|jpe?g|png|webp)"
|
||||
|
||||
|
||||
__tests__ = (
    # post on the m.post.naver.com host with a '.nhn' page extension
    {
        "#url": "https://m.post.naver.com/viewer/postView.nhn?volumeNo=15861102&memberNo=16220685",
        "#comment": ".nhn page extension",
        "#category": ("", "naverpost", "post"),
        "#class": naverpost.NaverpostPostExtractor,
        "#pattern": IMAGE_URL_PATTERN,
        "#count": 34,

        "title": "[쇼! 음악중심] 180526 방탄소년단 FAKE LOVE 현장 포토",
        "description": "[BY MBC예능연구소] [쇼! 음악중심] 589회, 20180526 ※본 콘텐츠는 상업적 용도의 사용을 금합니다.",
        "author": "MBC예능연구소",
        "date": "dt:2018-05-29 12:09:34",
        "views": int,
    },

    # post on the post.naver.com host with a '.naver' page extension
    {
        "#url": "https://post.naver.com/viewer/postView.naver?volumeNo=31389956&memberNo=29156514",
        "#comment": ".naver page extension",
        "#category": ("", "naverpost", "post"),
        "#class": naverpost.NaverpostPostExtractor,
        "#pattern": IMAGE_URL_PATTERN,
        "#count": 48,

        "title": "매일 밤 꿈꿔 왔던 드림캐쳐 '바람아' 활동 비하인드 현장",
        "description": "[BY 드림캐쳐컴퍼니] 안녕하세요.드림캐쳐 포스트 지기입니다!(*・▽・*)'Odd Eye' 활동이 끝나고 아쉬웠을...",
        "author": "드림캐쳐컴퍼니",
        "date": "dt:2021-05-03 06:00:09",
        "views": int,
    },

    # user listing; the range of 21 results exceeds the 20 posts
    # named in '#comment' as the per-request maximum
    {
        "#url": "https://post.naver.com/my.naver?memberNo=29156514",
        "#comment": "up to 20 posts are returned per request",
        "#category": ("", "naverpost", "user"),
        "#class": naverpost.NaverpostUserExtractor,
        "#pattern": naverpost.NaverpostPostExtractor.pattern,
        "#range": "1-21",
        "#count": 21,
    },
)
|
Loading…
Reference in New Issue
Block a user