From 757df3a84541dc0b862c03aa7982cb0737b11427 Mon Sep 17 00:00:00 2001 From: Braden Hilton Date: Thu, 9 Nov 2023 17:16:51 +0000 Subject: [PATCH] [naverpost] add 'post' and 'user' extractors --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/naverpost.py | 143 ++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + test/results/naverpost.py | 53 +++++++++++ 5 files changed, 204 insertions(+) create mode 100644 gallery_dl/extractor/naverpost.py create mode 100644 test/results/naverpost.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a15566df..639cd082 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -559,6 +559,12 @@ Consider all sites to be NSFW unless otherwise known. Blogs, Posts + + NaverPost + https://post.naver.com/ + Posts, User Profiles + + NaverWebtoon https://comic.naver.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 22e4fe34..c05425bc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -100,6 +100,7 @@ modules = [ "myhentaigallery", "myportfolio", "naver", + "naverpost", "naverwebtoon", "newgrounds", "nhentai", diff --git a/gallery_dl/extractor/naverpost.py b/gallery_dl/extractor/naverpost.py new file mode 100644 index 00000000..56de752c --- /dev/null +++ b/gallery_dl/extractor/naverpost.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://post.naver.com/""" + +from .common import Extractor, Message +from .. import text, exception +import json +import re + +BASE_PATTERN = r"(?:https?://)?(?:m\.)?post\.naver\.com" + + +class NaverpostExtractor(Extractor): + """Base class for naver post extractors""" + category = "naverpost" + root = "https://post.naver.com" + request_interval = (0.5, 1.5) + + def _call(self, url, params=None): + if params is None: + params = {} + while True: + try: + return self.request(url, params=params) + except exception.HttpError as exc: + if exc.status == 401: + raise exception.AuthenticationError() + if exc.status == 403: + raise exception.AuthorizationError() + if exc.status == 404: + raise exception.NotFoundError(self.subcategory) + self.log.debug(exc) + return + + def _pagination(self, url, params=None): + if params is None: + params = {} + while True: + res = self._call(url, params).text + # the `html` string in the response contains escaped single quotes, + # which would throw a JSONDecodeError exception + res = json.loads(res.replace(r"\'", "'")) + urls = [] + endpoints = text.extract_iter( + res["html"], '
\n', ' ') or + text.extr(page, '', ' ') + ).replace(",", "")), + "url": self.url, + } + return data + + def items(self): + page = self._call(self.url).text + data = self.metadata(page) + + yield Message.Directory, data + + image_classes = ("img_attachedfile", "se_mediaImage") + image_query = r"\?type=w\d+$" + for image in text.extract_iter(page, ""): + img = { + "id": text.extr(image, ' id="', '"'), + "title": text.extr(image, ' title="', '"'), + "attachment-id": text.extr( + image, ' data-attachment-id="', '"'), + "alt": None, + } + classes = text.extr(image, ' class="', '"').split() + if not any(item in classes for item in image_classes): + continue + url = text.extr(image, ' data-src="', '"') + if not re.search(image_query, url): + continue + url = re.sub(image_query, "", url) + img["url"] = url + alt = text.extr(image, ' alt="', '"') + if alt and alt.endswith(".jpg"): + img["alt"] = alt + data["filename"], _, data["extension"] = alt.rpartition(".") + else: + text.nameext_from_url(text.unquote(url), data) + data["image"] = img + yield Message.Url, url, data + + +class NaverpostUserExtractor(NaverpostExtractor): + """Extractor for all posts from a user on post.naver.com""" + subcategory = "user" + pattern = BASE_PATTERN + r"/my\.naver\?memberNo=(\d+)" + example = "https://post.naver.com/my.naver?memberNo=12345" + + def __init__(self, match): + NaverpostExtractor.__init__(self, match) + self.member_no = match.group(1) + + def items(self): + data = {"_extractor": NaverpostPostExtractor} + endpoint = self.root + "/async/my.naver" + params = {"memberNo": self.member_no} + posts = self._pagination(endpoint, params) + for url in posts: + yield Message.Queue, url, data diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 470b629d..ea7e9303 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -89,6 +89,7 @@ CATEGORY_MAP = { "mastodon.social": "mastodon.social", "myhentaigallery": "My Hentai Gallery", "myportfolio" : "Adobe Portfolio", + "naverpost" : "NaverPost", "naverwebtoon" : "NaverWebtoon", "nhentai" : "nhentai", "nijie" : "nijie", diff --git a/test/results/naverpost.py b/test/results/naverpost.py new file mode 100644 index 00000000..8332d937 --- /dev/null +++ b/test/results/naverpost.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import naverpost + +IMAGE_URL_PATTERN = r"(?i)https://post-phinf\.pstatic\.net/.*\.(?:gif|jpe?g|png|webp)" + + +__tests__ = ( +{ + "#url": "https://m.post.naver.com/viewer/postView.nhn?volumeNo=15861102&memberNo=16220685", + "#comment": ".nhn page extension", + "#category": ("", "naverpost", "post"), + "#class": naverpost.NaverpostPostExtractor, + "#pattern": IMAGE_URL_PATTERN, + "#count": 34, + + "title": "[쇼! 음악중심] 180526 방탄소년단 FAKE LOVE 현장 포토", + "description": "[BY MBC예능연구소] [쇼! 음악중심] 589회, 20180526 ※본 콘텐츠는 상업적 용도의 사용을 금합니다.", + "author": "MBC예능연구소", + "date": "dt:2018-05-29 12:09:34", + "views": int, +}, + +{ + "#url": "https://post.naver.com/viewer/postView.naver?volumeNo=31389956&memberNo=29156514", + "#comment": ".naver page extension", + "#category": ("", "naverpost", "post"), + "#class": naverpost.NaverpostPostExtractor, + "#pattern": IMAGE_URL_PATTERN, + "#count": 48, + + "title": "매일 밤 꿈꿔 왔던 드림캐쳐 '바람아' 활동 비하인드 현장", + "description": "[BY 드림캐쳐컴퍼니] 안녕하세요.드림캐쳐 포스트 지기입니다!(*・▽・*)'Odd Eye' 활동이 끝나고 아쉬웠을...", + "author": "드림캐쳐컴퍼니", + "date": "dt:2021-05-03 06:00:09", + "views": int, +}, + +{ + "#url": "https://post.naver.com/my.naver?memberNo=29156514", + "#comment": "up to 20 posts are returned per request", + "#category": ("", "naverpost", "user"), + "#class": naverpost.NaverpostUserExtractor, + "#pattern": naverpost.NaverpostPostExtractor.pattern, + "#range": "1-21", + "#count": 21, +}, + +)