From 67ac6667af049310f89b109a4d7416a27ccee52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 7 Aug 2020 22:30:10 +0200 Subject: [PATCH] [mangareader] fix extraction --- gallery_dl/extractor/mangapanda.py | 92 +++++++++++++++++++-- gallery_dl/extractor/mangareader.py | 122 +++++++++++----------------- 2 files changed, 136 insertions(+), 78 deletions(-) diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py index 18ef0054..a4b8340f 100644 --- a/gallery_dl/extractor/mangapanda.py +++ b/gallery_dl/extractor/mangapanda.py @@ -1,14 +1,15 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from https://www.mangapanda.com/""" +"""Extractors for https://www.mangapanda.com/""" -from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor +from .common import ChapterExtractor, MangaExtractor +from .. import text class MangapandaBase(): @@ -16,21 +17,102 @@ class MangapandaBase(): category = "mangapanda" root = "https://www.mangapanda.com" + @staticmethod + def parse_page(page, data): + """Parse metadata on 'page' and add it to 'data'""" + text.extract_all(page, ( + ("manga" , '

', '

'), + ("release", '>Year of Release:\n', ''), + ('author' , '>Author:\n', ''), + ('artist' , '>Artist:\n', ''), + ), values=data) + data["manga"] = data["manga"].strip() + data["author"] = text.unescape(data["author"]) + data["artist"] = text.unescape(data["artist"]) + return data -class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor): + +class MangapandaChapterExtractor(MangapandaBase, ChapterExtractor): """Extractor for manga-chapters from mangapanda.com""" + archive_fmt = "{manga}_{chapter}_{page}" pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))" test = ("https://www.mangapanda.com/red-storm/2", { "url": "1f633f776e950531ba9b1e81965316458e785261", "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb", }) + def __init__(self, match): + path, self.url_title, self.chapter = match.groups() + ChapterExtractor.__init__(self, match, self.root + path) -class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor): + def metadata(self, chapter_page): + page = self.request(self.root + self.url_title).text + data = self.parse_page(page, { + "chapter": text.parse_int(self.chapter), + "lang": "en", + "language": "English", + }) + text.extract_all(page, ( + ('title', ' ' + self.chapter + ' : ', ''), + ('date', '', ''), + ), page.index('
'), data) + data["count"] = text.parse_int(text.extract( + chapter_page, ' of ', '<')[0] + ) + return data + + def images(self, page): + while True: + next_url, image_url, image_data = self.get_image_metadata(page) + yield image_url, image_data + + if not next_url: + return + page = self.request(next_url).text + + def get_image_metadata(self, page): + """Collect next url, image-url and metadata for one manga-page""" + extr = text.extract + width = None + test , pos = extr(page, "document['pu']", '') + if test is None: + return None, None, None + if page.find("document['imgwidth']", pos, pos+200) != -1: + width , pos = extr(page, "document['imgwidth'] = ", ";", pos) + height, pos = extr(page, "document['imgheight'] = ", ";", pos) + _ , pos = extr(page, '
', '') + url, pos = extr(page, ' href="', '"', pos) + if width is None: + width , pos = extr(page, '') + while True: + url, pos = text.extract(page, needle, '"', pos) + if not url: + return results + data["title"], pos = text.extract(page, ' : ', '', pos) + data["date"] , pos = text.extract(page, '', '', pos) + data["chapter"] = text.parse_int(url.rpartition("/")[2]) + results.append((self.root + url, data.copy())) diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 31083dc8..fd9c7ace 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -6,10 +6,12 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from https://www.mangareader.net/""" +"""Extractors for https://www.mangareader.net/""" from .common import ChapterExtractor, MangaExtractor from .. import text +from ..cache import memcache +import json class MangareaderBase(): @@ -17,19 +19,35 @@ class MangareaderBase(): category = "mangareader" root = "https://www.mangareader.net" - @staticmethod - def parse_page(page, data): - """Parse metadata on 'page' and add it to 'data'""" - text.extract_all(page, ( - ("manga" , '

', '

'), - ("release", '>Year of Release:\n', ''), - ('author' , '>Author:\n', ''), - ('artist' , '>Artist:\n', ''), - ), values=data) - data["manga"] = data["manga"].strip() - data["author"] = text.unescape(data["author"]) - data["artist"] = text.unescape(data["artist"]) - return data + @memcache(keyarg=1) + def _manga_info(self, path, page=None): + if not page: + page = self.request(self.root + path).text + extr = text.extract_from(page) + data = { + "manga" : text.unescape(extr('class="name">', '<')), + "release" : text.unescape(extr('Year of Release :', '<')), + "author" : text.unescape(text.unescape(extr( + 'Author :', '<'))), + "artist" : text.unescape(text.unescape(extr( + 'Artist :', '<'))), + "lang" : "en", + "language": "English", + } + + extr('') + chapters = [] + while True: + url = extr(' : ", "<")), + "date" : extr("", "<"), + } + chapter.update(data) + chapters.append((self.root + url, chapter)) class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): @@ -38,59 +56,28 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))" test = (("https://www.mangareader.net" "/karate-shoukoushi-kohinata-minoru/11"), { - "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4", + "url": "45ece5668d1e9f65cf2225237d78de58660b54e4", "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6", }) def __init__(self, match): - path, self.url_title, self.chapter = match.groups() - ChapterExtractor.__init__(self, match, self.root + path) + ChapterExtractor.__init__(self, match) + _, self.path, self.chapter = match.groups() - def metadata(self, chapter_page): - page = self.request(self.root + self.url_title).text - data = self.parse_page(page, { - "chapter": text.parse_int(self.chapter), - "lang": "en", - "language": "English", - }) - text.extract_all(page, ( - ('title', ' ' + self.chapter + ' : ', ''), - ('date', '', ''), - ), page.index('
'), data) - data["count"] = text.parse_int(text.extract( - chapter_page, ' of ', '<')[0] - ) - return data + def metadata(self, page): + chapter = text.parse_int(self.chapter) + return self._manga_info(self.path)[chapter-1][1] def images(self, page): - while True: - next_url, image_url, image_data = self.get_image_metadata(page) - yield image_url, image_data - - if not next_url: - return - page = self.request(next_url).text - - def get_image_metadata(self, page): - """Collect next url, image-url and metadata for one manga-page""" - extr = text.extract - width = None - test , pos = extr(page, "document['pu']", '') - if test is None: - return None, None, None - if page.find("document['imgwidth']", pos, pos+200) != -1: - width , pos = extr(page, "document['imgwidth'] = ", ";", pos) - height, pos = extr(page, "document['imgheight'] = ", ";", pos) - _ , pos = extr(page, '
', '') - url, pos = extr(page, ' href="', '"', pos) - if width is None: - width , pos = extr(page, '')[0]) + return [ + (text.ensure_http_scheme(img["u"]), { + "width" : text.parse_int(img["w"]), + "height": text.parse_int(img["h"]), + }) + for img in data["im"] + ] class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): @@ -104,16 +91,5 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): }) def chapters(self, page): - results = [] - data = self.parse_page(page, {"lang": "en", "language": "English"}) - - needle = '
\n') - while True: - url, pos = text.extract(page, needle, '"', pos) - if not url: - return results - data["title"], pos = text.extract(page, ' : ', '', pos) - data["date"] , pos = text.extract(page, '', '', pos) - data["chapter"] = text.parse_int(url.rpartition("/")[2]) - results.append((self.root + url, data.copy())) + path = self.manga_url[len(self.root):] + return self._manga_info(path, page)