# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://hatenablog.com"""
import re
from .common import Extractor, Message
from .. import text
# URL prefix shared by the extractor patterns. It has two alternatives:
#   group 1 - host of a URL explicitly prefixed with "hatenablog:"
#             (any host is accepted in this form)
#   group 2 - "<name>.<domain>" host on one of the listed Hatena blog
#             domains, with the "http(s)://" scheme being optional
BASE_PATTERN = (
    r"(?:"
    r"hatenablog:https?://([^/?#]+)"
    r"|(?:https?://)?([\w-]+\.(?:"
    r"hatenablog\.(?:com|jp)|hatenadiary\.com|hateblo\.jp"
    r"))"
    r")"
)

# Optional "?query" part (captured without the "?") followed by an optional
# "#fragment", anchored at the end of the URL.
QUERY_RE = (
    r"(?:\?([^#]*))?"
    r"(?:#.*)?"
    r"$"
)
class HatenablogExtractor(Extractor):
"""Base class for HatenaBlog extractors"""
category = "hatenablog"
directory_fmt = ("{category}", "{domain}")
filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
archive_fmt = "{filename}"
def __init__(self, match):
    """Initialize the base extractor and record the matched domain.

    Args:
        match: Regex match object for the URL; its capture groups 1 and 2
            are read here.

    ``self.domain`` is set to capture group 1, or to capture group 2 when
    group 1 is empty or did not participate in the match.
    """
    Extractor.__init__(self, match)
    primary_host = match.group(1)
    self.domain = primary_host if primary_host else match.group(2)
def _init(self):
self._find_img = re.compile(r']+)').finditer
def _handle_article(self, article: str):
extr = text.extract_from(article)
date = text.parse_datetime(extr('