merge #5037: [hatenablog] add support (#5036)

2024-11-25 04:02:32 +01:00 · 2024-01-13 00:57:21 +01:00 · 2024-01-13 00:57:21 +01:00 · 71e2c3e5a2
commit 71e2c3e5a2
parent b1c175fdd1 9f53daabb8
5 changed files with 324 additions and 1 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW.
    <td>Folders</td>
    <td></td>
 </tr>
+<tr>
+    <td>HatenaBlog</td>
+    <td>https://hatenablog.com</td>
+    <td>Archive, Individual Posts, Home Feed, Search Results</td>
+    <td></td>
+</tr>
 <tr>
    <td>HBrowse</td>
    <td>https://www.hbrowse.com/</td>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@ -53,6 +53,7 @@ modules = [
    "gelbooru_v01",
    "gelbooru_v02",
    "gofile",
+    "hatenablog",
    "hbrowse",
    "hentai2read",
    "hentaicosplays",
--- a/gallery_dl/extractor/hatenablog.py
+++ b/gallery_dl/extractor/hatenablog.py
@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hatenablog.com"""
+
+import re
+from .common import Extractor, Message
+from .. import text
+
+
+# Matches the blog host in one of two forms, each in its own group:
+#   group 1: any host written with an explicit "hatenablog:" prefix
+#   group 2: a subdomain of hatenablog.com/.jp, hatenadiary.com, hateblo.jp
+BASE_PATTERN = (
+    r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
+    r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
+    r"|hatenadiary\.com|hateblo\.jp)))"
+)
+# Optional query string (captured without the leading "?"), followed by an
+# optional "#fragment"; anchored to the end of the URL
+QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"
+
+
+class HatenablogExtractor(Extractor):
+    """Base class for HatenaBlog extractors"""
+    category = "hatenablog"
+    directory_fmt = ("{category}", "{domain}")
+    filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
+    archive_fmt = "{filename}"
+
+    def __init__(self, match):
+        """Remember the blog host captured by the URL pattern"""
+        Extractor.__init__(self, match)
+        # group 1 is set for "hatenablog:"-prefixed URLs,
+        # group 2 for hosts on one of the listed Hatena domains
+        self.domain = match.group(1) or match.group(2)
+
+    def _init(self):
+        """Prepare the finder for <img> tags and their attribute text"""
+        self._find_img = re.compile(r'<img +([^>]+)').finditer
+
+    def _handle_article(self, article: str):
+        """Yield a Directory message, then one Url message per image
+
+        'article' is the markup of a single entry as passed in by the
+        callers in this module.
+        """
+        # NOTE(review): 'extr' appears to consume 'article' front to back,
+        # so the order of the following calls matters - confirm against
+        # text.extract_from() before reordering
+        extr = text.extract_from(article)
+        date = text.parse_datetime(extr('<time datetime="', '"'))
+        entry_link = text.unescape(extr('<a href="', '"'))
+        # everything after "/entry/" in the entry's link
+        entry = entry_link.partition("/entry/")[2]
+        title = text.unescape(extr('>', '<'))
+        content = extr(
+            '<div class="entry-content hatenablog-entry">', '</div>')
+
+        images = []
+        for i in self._find_img(content):
+            attributes = i.group(1)
+            # skip <img> tags without class="hatena-fotolife"
+            if 'class="hatena-fotolife"' not in attributes:
+                continue
+            image = text.unescape(text.extr(attributes, 'src="', '"'))
+            images.append(image)
+
+        data = {
+            "domain": self.domain,
+            "date": date,
+            "entry": entry,
+            "title": title,
+            "count": len(images),
+        }
+        yield Message.Directory, data
+        # the same 'data' dict is reused for every image;
+        # the loop target assigns the 1-based index to data["num"]
+        for data["num"], url in enumerate(images, 1):
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class HatenablogEntriesExtractor(HatenablogExtractor):
+    """Base class for a list of entries"""
+    # query keys accepted in addition to "page"; overridden by subclasses
+    allowed_parameters = ()
+
+    def __init__(self, match):
+        """Store the URL path (group 3) and the accepted query parameters"""
+        HatenablogExtractor.__init__(self, match)
+        self.path = match.group(3)
+        # group 4 is the query string; keep only the keys that
+        # _acceptable_query() lets through
+        self.query = {key: value for key, value in text.parse_query(
+            match.group(4)).items() if self._acceptable_query(key)}
+
+    def _init(self):
+        """Additionally prepare the search for the 'pager-next' link"""
+        HatenablogExtractor._init(self)
+        # group 1: href of the <a> directly after class="pager-next">
+        self._find_pager_url = re.compile(
+            r' class="pager-next">\s*<a href="([^"]+)').search
+
+    def items(self):
+        """Walk all listing pages and yield the messages for their entries
+
+        Starts at 'self.path' and continues with each page's "pager-next"
+        link until a page without one is reached.
+        """
+        url = "https://" + self.domain + self.path
+        query = self.query
+
+        while url:
+            page = self.request(url, params=query).text
+
+            extr = text.extract_from(page)
+            attributes = extr('<body ', '>')
+            # "page-archive" in the <body> attributes selects the handler
+            # for archive sections; any other page is read as full articles
+            if "page-archive" in attributes:
+                yield from self._handle_partial_articles(extr)
+            else:
+                yield from self._handle_full_articles(extr)
+
+            match = self._find_pager_url(page)
+            url = text.unescape(match.group(1)) if match else None
+            # 'self.query' is only passed along with the first request
+            query = None
+
+    def _handle_partial_articles(self, extr):
+        """Queue the linked entry of every 'archive-entry' section"""
+        while True:
+            section = extr('<section class="archive-entry', '</section>')
+            if not section:
+                break
+
+            # the "hatenablog:" prefix corresponds to the first
+            # alternative of BASE_PATTERN, which accepts any host
+            url = "hatenablog:" + text.unescape(text.extr(
+                section, '<a class="entry-title-link" href="', '"'))
+            data = {"_extractor": HatenablogEntryExtractor}
+            yield Message.Queue, url, data
+
+    def _handle_full_articles(self, extr):
+        """Yield the messages of every <article> lacking 'no-entry'"""
+        while True:
+            attributes = extr('<article ', '>')
+            if not attributes:
+                break
+            if "no-entry" in attributes:
+                continue
+
+            article = extr('', '</article>')
+            yield from self._handle_article(article)
+
+    def _acceptable_query(self, key):
+        """Return whether the query parameter 'key' should be kept"""
+        return key == "page" or key in self.allowed_parameters
+
+
+class HatenablogEntryExtractor(HatenablogExtractor):
+    """Extractor for a single entry URL"""
+    subcategory = "entry"
+    # group 3: everything after "/entry/" up to the query or fragment
+    pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
+    example = "https://BLOG.hatenablog.com/entry/PATH"
+
+    def __init__(self, match):
+        """Store the entry path (pattern group 3) as 'self.path'"""
+        HatenablogExtractor.__init__(self, match)
+        self.path = match.group(3)
+
+    def items(self):
+        """Fetch the entry page and hand its article to _handle_article()
+
+        <article> tags whose attributes contain "no-entry" are skipped.
+        """
+        url = "https://" + self.domain + "/entry/" + self.path
+        page = self.request(url).text
+
+        extr = text.extract_from(page)
+        while True:
+            attributes = extr('<article ', '>')
+            if "no-entry" in attributes:
+                continue
+            article = extr('', '</article>')
+            # this method contains no 'yield'; it returns the generator
+            # created by _handle_article() for the first accepted article
+            return self._handle_article(article)
+
+
+class HatenablogHomeExtractor(HatenablogEntriesExtractor):
+    """Extractor for a blog's home page"""
+    subcategory = "home"
+    # group 3 (used as the request path) is "" or "/"
+    pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
+    example = "https://BLOG.hatenablog.com"
+
+
+class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
+    """Extractor for a blog's archive page"""
+    subcategory = "archive"
+    # group 3: "/archive", optionally followed by either one to three
+    # numeric path segments or by "/category/<name>"
+    pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
+               r"|/category/[^?#]+)?)" + QUERY_RE)
+    example = "https://BLOG.hatenablog.com/archive/2024"
+
+
+class HatenablogSearchExtractor(HatenablogEntriesExtractor):
+    """Extractor for a blog's search results"""
+    subcategory = "search"
+    # group 3 is the literal path "/search"
+    pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
+    example = "https://BLOG.hatenablog.com/search?q=QUERY"
+    # keep the "q" query parameter in addition to "page"
+    allowed_parameters = ("q",)
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@ -50,6 +50,7 @@ CATEGORY_MAP = {
    "fanbox"         : "pixivFANBOX",
    "fashionnova"    : "Fashion Nova",
    "furaffinity"    : "Fur Affinity",
+    "hatenablog"     : "HatenaBlog",
    "hbrowse"        : "HBrowse",
    "hentai2read"    : "Hentai2Read",
    "hentaicosplays" : "Hentai Cosplay",
@ -102,7 +103,6 @@ CATEGORY_MAP = {
    "pornimagesxxx"  : "Porn Image",
    "pornpics"       : "PornPics.com",
    "pornreactor"    : "PornReactor",
-    "postmill"       : "Postmill",
    "readcomiconline": "Read Comic Online",
    "rbt"            : "RebeccaBlackTech",
    "redgifs"        : "RedGIFs",
@ -189,6 +189,11 @@ SUBCATEGORY_MAP = {
    "fapello": {
        "path": "Videos, Trending Posts, Popular Videos, Top Models",
    },
+    "hatenablog": {
+        "archive": "Archive",
+        "entry"  : "Individual Posts",
+        "home"   : "Home Feed",
+    },
    "hentaifoundry": {
        "story": "",
    },
--- a/test/results/hatenablog.py
+++ b/test/results/hatenablog.py
@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import hatenablog
+
+
+# Test cases, grouped below by the extractor class named under "#class".
+# Each group covers the hatenablog.com, hatenablog.jp, hatenadiary.com and
+# hateblo.jp hosts plus one "hatenablog:"-prefixed URL.
+__tests__ = (
+# --- HatenablogEntryExtractor ---
+{
+    "#url"     : "https://cosmiclatte.hatenablog.com/entry/2020/05/28/003227",
+    "#category": ("", "hatenablog", "entry"),
+    "#class"   : hatenablog.HatenablogEntryExtractor,
+    "#count"   : 20,
+},
+
+{
+    "#url"     : "https://moko0908.hatenablog.jp/entry/2023/12/31/083846",
+    "#category": ("", "hatenablog", "entry"),
+    "#class"   : hatenablog.HatenablogEntryExtractor,
+},
+
+{
+    "#url"     : "https://p-shirokuma.hatenadiary.com/entry/20231227/1703685600",
+    "#category": ("", "hatenablog", "entry"),
+    "#class"   : hatenablog.HatenablogEntryExtractor,
+},
+
+{
+    "#url"     : "https://urakatahero.hateblo.jp/entry/2ndlife",
+    "#category": ("", "hatenablog", "entry"),
+    "#class"   : hatenablog.HatenablogEntryExtractor,
+},
+
+{
+    "#url"     : "hatenablog:https://blog.hyouhon.com/entry/2023/12/22/133549",
+    "#category": ("", "hatenablog", "entry"),
+    "#class"   : hatenablog.HatenablogEntryExtractor,
+},
+
+# --- HatenablogHomeExtractor ---
+{
+    "#url"     : "https://cetriolo.hatenablog.com",
+    "#category": ("", "hatenablog", "home"),
+    "#class"   : hatenablog.HatenablogHomeExtractor,
+    "#range"   : "1-7",
+    "#count"   : 7,
+},
+
+{
+    "#url"     : "https://moko0908.hatenablog.jp/",
+    "#category": ("", "hatenablog", "home"),
+    "#class"   : hatenablog.HatenablogHomeExtractor,
+},
+
+{
+    "#url"     : "https://p-shirokuma.hatenadiary.com/",
+    "#category": ("", "hatenablog", "home"),
+    "#class"   : hatenablog.HatenablogHomeExtractor,
+},
+
+{
+    "#url"     : "https://urakatahero.hateblo.jp/",
+    "#category": ("", "hatenablog", "home"),
+    "#class"   : hatenablog.HatenablogHomeExtractor,
+},
+
+{
+    "#url"     : "hatenablog:https://blog.hyouhon.com/",
+    "#category": ("", "hatenablog", "home"),
+    "#class"   : hatenablog.HatenablogHomeExtractor,
+},
+
+# --- HatenablogArchiveExtractor ---
+{
+    "#url"     : ("https://8saki.hatenablog.com/archive/category/%E3%82%BB%E3"
+                  "%83%AB%E3%83%95%E3%82%B8%E3%82%A7%E3%83%AB%E3%83%8D%E3%82"
+                  "%A4%E3%83%AB"),
+    "#category": ("", "hatenablog", "archive"),
+    "#class"   : hatenablog.HatenablogArchiveExtractor,
+    "#range"   : "1-30",
+    "#count"   : 30,
+},
+
+{
+    "#url"     : "https://moko0908.hatenablog.jp/archive/2023",
+    "#category": ("", "hatenablog", "archive"),
+    "#class"   : hatenablog.HatenablogArchiveExtractor,
+    "#count"   : 13,
+},
+
+{
+    "#url"     : "https://p-shirokuma.hatenadiary.com/archive/2023/01",
+    "#category": ("", "hatenablog", "archive"),
+    "#class"   : hatenablog.HatenablogArchiveExtractor,
+    "#count"   : 5,
+},
+
+{
+    "#url"     : "https://urakatahero.hateblo.jp/archive",
+    "#category": ("", "hatenablog", "archive"),
+    "#class"   : hatenablog.HatenablogArchiveExtractor,
+    "#range"   : "1-30",
+    "#count"   : 30,
+},
+
+{
+    "#url"     : "hatenablog:https://blog.hyouhon.com/archive/2024/01/01",
+    "#category": ("", "hatenablog", "archive"),
+    "#class"   : hatenablog.HatenablogArchiveExtractor,
+},
+
+# --- HatenablogSearchExtractor ---
+{
+    "#url"     : "hatenablog:https://blog.hyouhon.com/search?q=a",
+    "#category": ("", "hatenablog", "search"),
+    "#class"   : hatenablog.HatenablogSearchExtractor,
+    "#range"   : "1-30",
+    "#count"   : 30,
+},
+
+{
+    "#url"     : "https://cosmiclatte.hatenablog.com/search?q=a",
+    "#category": ("", "hatenablog", "search"),
+    "#class"   : hatenablog.HatenablogSearchExtractor,
+},
+
+{
+    "#url"     : "https://moko0908.hatenablog.jp/search?q=a",
+    "#category": ("", "hatenablog", "search"),
+    "#class"   : hatenablog.HatenablogSearchExtractor,
+},
+
+{
+    "#url"     : "https://p-shirokuma.hatenadiary.com/search?q=a",
+    "#category": ("", "hatenablog", "search"),
+    "#class"   : hatenablog.HatenablogSearchExtractor,
+},
+
+{
+    "#url"     : "https://urakatahero.hateblo.jp/search?q=a",
+    "#category": ("", "hatenablog", "search"),
+    "#class"   : hatenablog.HatenablogSearchExtractor,
+},
+
+)