# -*- coding: utf-8 -*-

# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for http://blog.livedoor.jp/"""

from .common import Extractor, Message
from .. import text


class LivedoorExtractor(Extractor):
    """Base class for livedoor extractors"""
    category = "livedoor"
    root = "http://blog.livedoor.jp"
    filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}"
    directory_fmt = ("{category}", "{post[user]}")
    archive_fmt = "{post[id]}_{hash}"

    def __init__(self, match):
        """Store the user name captured by the first pattern group"""
        Extractor.__init__(self, match)
        self.user = match.group(1)

    def items(self):
        """Yield one Directory message per post, then its image URLs

        Posts for which _images() finds nothing produce no messages.
        """
        for post in self.posts():
            images = self._images(post)
            if images:
                yield Message.Directory, {"post": post}
                for image in images:
                    yield Message.Url, image["url"], image

    def posts(self):
        """Return an iterable with post objects"""

    def _load(self, data, body):
        """Build a post object from its metadata block and HTML body

        'data' is the text holding the rdf:about / dc:* attributes of a
        post; 'body' is the article's inner HTML.  The body is stored
        under "body" so that _images() can consume it later.
        """
        extr = text.extract_from(data)
        # NOTE(review): the end delimiter was an empty string, which
        # always produced an empty match and therefore no tags.
        # '</dl>' is assumed to close the tag list - confirm against
        # the site's markup.
        tags = text.extract(body, 'class="article-tags">', '</dl>')[0]
        about = extr('rdf:about="', '"')

        return {
            # last path segment of the 'about' URL, up to its first "."
            "id"         : text.parse_int(
                about.rpartition("/")[2].partition(".")[0]),
            "title"      : text.unescape(extr('dc:title="', '"')),
            # 2-tuple: text before and after the first ","
            "categories" : extr('dc:subject="', '"').partition(",")[::2],
            "description": extr('dc:description="', '"'),
            "date"       : text.parse_datetime(extr('dc:date="', '"')),
            # the first entry is dropped (presumably a label - verify)
            "tags"       : text.split_html(tags)[1:] if tags else [],
            "user"       : self.user,
            "body"       : body,
        }

    def _images(self, post):
        """Return a list of image objects found in a post's body

        Removes the "body" entry from 'post'.  'num' counts every
        image tag, including those skipped for lacking a 'src'.
        """
        imgs = []
        body = post.pop("body")

        # NOTE(review): the delimiters were reduced to a single empty
        # string; restored to match whole <img ...> tags.
        for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
            src = text.extract(img, 'src="', '"')[0]
            alt = text.extract(img, 'alt="', '"')[0]

            if not src:
                continue
            if "://livedoor.blogimg.jp/" in src:
                # force https and strip "-s." (presumably the thumbnail
                # suffix - verify) to address the full-size file
                url = src.replace("http:", "https:", 1).replace("-s.", ".")
            else:
                url = text.urljoin(self.root, src)
            name, _, ext = url.rpartition("/")[2].rpartition(".")

            imgs.append({
                "url"      : url,
                "num"      : num,
                "hash"     : name,
                "filename" : alt or name,
                "extension": ext,
                "post"     : post,
            })

        return imgs


class LivedoorBlogExtractor(LivedoorExtractor):
    """Extractor for a user's blog on blog.livedoor.jp"""
    subcategory = "blog"
    pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?#])"
    test = (
        ("http://blog.livedoor.jp/zatsu_ke/", {
            "range": "1-50",
            "count": 50,
            "archive": False,
            "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+",
            "keyword": {
                "post": {
                    "categories" : tuple,
                    "date"       : "type:datetime",
                    "description": str,
                    "id"         : int,
                    "tags"       : list,
                    "title"      : str,
                    "user"       : "zatsu_ke"
                },
                "filename": str,
                "hash"    : r"re:\w{4,}",
                "num"     : int,
            },
        }),
        ("http://blog.livedoor.jp/uotapo/", {
            "range": "1-5",
            "count": 5,
        }),
    )

    def posts(self):
        """Yield post objects from every page of the user's blog

        Each page is scanned for consecutive metadata/body pairs; the
        loop ends when a page provides no link to a following page.
        """
        url = "{}/{}".format(self.root, self.user)
        while url:
            extr = text.extract_from(self.request(url).text)
            while True:
                # NOTE(review): delimiters restored; the call had been
                # reduced to extr('') - confirm against the site.
                data = extr('<rdf:RDF', '</rdf:RDF>')
                if not data:
                    break
                body = extr('class="article-body-inner">',
                            'class="article-footer">')
                yield self._load(data, body)
            # NOTE(review): delimiters restored; the call had been
            # reduced to extr('').  An empty result ends the loop.
            url = extr('<a rel="next" href="', '"')

    # NOTE(review): this method was followed by the stray statements
    #     body = extr('class="article-body-inner">',
    #                 'class="article-footer">')
    #     return (self._load(data, body),)
    # They look like the tail of a separate single-post definition
    # whose beginning is not present in this file.  Left inside this
    # generator they would end it after the first page and discard
    # the returned tuple, so they are not part of posts().
    # TODO: restore that definition from version control.