diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index eb411fc9..8393cc51 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -207,16 +207,17 @@ class ChapterExtractor(Extractor): "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") archive_fmt = ( "{manga}_{chapter}{chapter_minor}_{page}") + root = "" - def __init__(self, match, url): + def __init__(self, match, url=None): Extractor.__init__(self, match) - self.url = url + self.chapter_url = url or self.root + match.group(1) def items(self): self.login() - page = self.request(self.url).text - data = self.get_metadata(page) - imgs = self.get_images(page) + page = self.request(self.chapter_url).text + data = self.metadata(page) + imgs = self.images(page) if "count" in data: images = zip( @@ -240,10 +241,10 @@ class ChapterExtractor(Extractor): def login(self): """Login and set necessary cookies""" - def get_metadata(self, page): + def metadata(self, page): """Return a dict with general metadata""" - def get_images(self, page): + def images(self, page): """Return a list of all (image-url, metadata)-tuples""" @@ -251,19 +252,19 @@ class MangaExtractor(Extractor): subcategory = "manga" categorytransfer = True - scheme = "http" - root = "" reverse = True + root = "" def __init__(self, match, url=None): Extractor.__init__(self, match) - self.url = url or self.scheme + "://" + match.group(1) + self.manga_url = url or self.root + match.group(1) if self.config("chapter-reverse", False): self.reverse = not self.reverse def items(self): - page = self.request(self.url).text + self.login() + page = self.request(self.manga_url).text chapters = self.chapters(page) if self.reverse: @@ -273,6 +274,9 @@ class MangaExtractor(Extractor): for chapter, data in chapters: yield Message.Queue, chapter, data + def login(self): + """Login and set necessary cookies""" + def chapters(self, page): """Return a list of all (chapter-url, metadata)-tuples""" diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 3600515c..2dad0cca 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -10,14 +10,14 @@ from .common import ChapterExtractor from .. import text -import re import json +import re class DynastyscansChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from dynasty-scans.com""" category = "dynastyscans" - pattern = r"(?:https?://)?(?:www\.)?dynasty-scans\.com/chapters/([^/]+)" + pattern = r"(?:https?://)?(?:www\.)?dynasty-scans\.com(/chapters/[^/?&#]+)" test = ( (("http://dynasty-scans.com/chapters/" "hitoribocchi_no_oo_seikatsu_ch33"), { @@ -32,13 +32,7 @@ class DynastyscansChapterExtractor(ChapterExtractor): ) root = "https://dynasty-scans.com" - def __init__(self, match): - self.chaptername = match.group(1) - url = self.root + "/chapters/" + self.chaptername - ChapterExtractor.__init__(self, match, url) - - def get_metadata(self, page): - """Collect metadata for extractor-job""" + def metadata(self, page): info , pos = text.extract(page, "

", "") author, pos = text.extract(page, " by ", "", pos) group , pos = text.extract(page, '"icon-print"> ', '', pos) @@ -64,8 +58,7 @@ class DynastyscansChapterExtractor(ChapterExtractor): "language": "English", } - def get_images(self, page): - """Extract list of all image-urls for a manga chapter""" + def images(self, page): data = text.extract(page, "var pages = ", ";\n")[0] return [ (self.root + img["image"], None) diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index 62d2caca..b7e209e1 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -38,7 +38,7 @@ class FallenangelsChapterExtractor(ChapterExtractor): self.version, self.manga, self.chapter) ChapterExtractor.__init__(self, match, url) - def get_metadata(self, page): + def metadata(self, page): lang = "vi" if self.version == "truyen" else "en" data = { "chapter": self.chapter, @@ -52,7 +52,7 @@ class FallenangelsChapterExtractor(ChapterExtractor): ), values=data)[0] @staticmethod - def get_images(page): + def images(page): return [ (img["page_image"], None) for img in json.loads( @@ -65,7 +65,6 @@ class FallenangelsMangaExtractor(MangaExtractor): """Extractor for manga from fascans.com""" category = "fallenangels" pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$" - scheme = "https" test = ( ("http://manga.fascans.com/manga/trinity-seven", { "url": "92699a250ff7d5adcf4b06e6a45b0c05f3426643", @@ -78,8 +77,9 @@ class FallenangelsMangaExtractor(MangaExtractor): ) def __init__(self, match): - MangaExtractor.__init__(self, match) + url = "https://" + match.group(1) self.lang = "vi" if match.group(2) == "truyen" else "en" + MangaExtractor.__init__(self, match, url) def chapters(self, page): language = util.code_to_language(self.lang) diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 84fdbe75..5249e1fe 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -8,7 +8,8 @@ """Extractors for FoOlSlide based sites""" -from .common import Extractor, MangaExtractor, Message, SharedConfigMixin +from .common import ( + Extractor, ChapterExtractor, MangaExtractor, Message, SharedConfigMixin) from .. import text, util, config import base64 import json @@ -35,24 +36,17 @@ class FoolslideBase(SharedConfigMixin): return data -class FoolslideChapterExtractor(FoolslideBase, Extractor): +class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): """Base class for chapter extractors for FoOlSlide based sites""" - subcategory = "chapter" directory_fmt = ( "{category}", "{manga}", "{chapter_string}") - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") archive_fmt = "{id}" decode = "default" - def __init__(self, match): - Extractor.__init__(self, match) - self.url = self.root + match.group(1) - def items(self): - page = self.request(self.url).text - data = self.get_metadata(page) - imgs = self.get_images(page) + page = self.request(self.chapter_url).text + data = self.metadata(page) + imgs = self.images(page) data["count"] = len(imgs) data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"]) @@ -73,20 +67,18 @@ class FoolslideChapterExtractor(FoolslideBase, Extractor): text.nameext_from_url(data["filename"], data) yield Message.Url, url, data - def get_metadata(self, page): - """Collect metadata for extractor-job""" + def metadata(self, page): _ , pos = text.extract(page, '

', '') manga , pos = text.extract(page, 'title="', '"', pos) chapter, pos = text.extract(page, 'title="', '"', pos) chapter = text.unescape(chapter) - return self.parse_chapter_url(self.url, { + return self.parse_chapter_url(self.chapter_url, { "manga": text.unescape(manga).strip(), "title": chapter.partition(":")[2].strip(), "chapter_string": chapter, }) - def get_images(self, page): - """Return a list of all images in this chapter""" + def images(self, page): if self.decode == "base64": base64_data = text.extract(page, 'atob("', '"')[0].encode() data = base64.b64decode(base64_data).decode() @@ -101,11 +93,7 @@ class FoolslideChapterExtractor(FoolslideBase, Extractor): class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): """Base class for manga extractors for FoOlSlide based sites""" - def __init__(self, match): - MangaExtractor.__init__(self, match, self.root + match.group(1)) - def chapters(self, page): - """Return a list of all chapter urls""" manga , pos = text.extract(page, '

', '

') author, pos = text.extract(page, 'Author: ', 'Artist: ', '05}") + filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" + "{page:>03}.{extension}") + archive_fmt = "{manga_id}_{chapter}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))" + test = ("https://www.hbrowse.com/10363/c00000", { + "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", + "keyword": "95ec73a58aeac57f4dd20f0fa0c2812b045a30e8", + "content": "44578ebbe176c2c27434966aef22945787e2781e", + }) + + def __init__(self, match): + self.path, self.gid, self.chapter = match.groups() + self.path += "/" + ChapterExtractor.__init__(self, match) + + def metadata(self, page): + return self.parse_page(page, { + "manga_id": text.parse_int(self.gid), + "chapter": text.parse_int(self.chapter) + }) + + def images(self, page): + base = self.root + "/data" + self.path + json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]" + return [(base + name, None) for name in json.loads(json_data)] + + class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): """Extractor for manga from hbrowse.com""" - pattern = r"(?:https?://)?((?:www\.)?hbrowse\.com/\d+)/?$" + pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$" reverse = False test = ("https://www.hbrowse.com/10363", { "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6", @@ -55,7 +85,7 @@ class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): results = [] data = self.parse_page(page, { "manga_id": text.parse_int( - self.url.rstrip("/").rpartition("/")[2]) + self.manga_url.rstrip("/").rpartition("/")[2]) }) pos = 0 @@ -68,33 +98,3 @@ class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): data["chapter"] = text.parse_int(url.rpartition("/")[2][1:]) data["title"] = title results.append((text.urljoin(self.root, url), data.copy())) - - -class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor): - """Extractor for manga-chapters from hbrowse.com""" - directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}") - filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" - "{page:>03}.{extension}") - archive_fmt = "{manga_id}_{chapter}_{page}" - pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)" - test = ("https://www.hbrowse.com/10363/c00000", { - "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", - "keyword": "95ec73a58aeac57f4dd20f0fa0c2812b045a30e8", - "content": "44578ebbe176c2c27434966aef22945787e2781e", - }) - - def __init__(self, match): - self.gid, self.chapter = match.groups() - self.path = "/{}/c{}/".format(self.gid, self.chapter) - ChapterExtractor.__init__(self, match, self.root + self.path) - - def get_metadata(self, page): - return self.parse_page(page, { - "manga_id": text.parse_int(self.gid), - "chapter": text.parse_int(self.chapter) - }) - - def get_images(self, page): - base = self.root + "/data" + self.path - json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]" - return [(base + name, None) for name in json.loads(json_data)] diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index 06f977ac..e93c02ae 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -10,21 +10,65 @@ from .common import ChapterExtractor, MangaExtractor from .. import text -import re import json +import re -class Hentai2readMangaExtractor(MangaExtractor): - """Extractor for hmanga from hentai2read.com""" +class Hentai2readBase(): + """Base class for hentai2read extractors""" category = "hentai2read" - scheme = "https" - pattern = r"(?:https?://)?(?:www\.)?(hentai2read\.com/[^/]+/?)$" + root = "https://hentai2read.com" + + +class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): + """Extractor for a single manga chapter from hentai2read.com""" + archive_fmt = "{chapter_id}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))" + test = ("https://hentai2read.com/amazon_elixir/1/", { + "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", + "keyword": "9845105898d28c6a540cffdea60a1a20fab52431", + }) + + def __init__(self, match): + self.chapter = match.group(2) + ChapterExtractor.__init__(self, match) + + def metadata(self, page): + title, pos = text.extract(page, "", "") + manga_id, pos = text.extract(page, 'data-mid="', '"', pos) + chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) + match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - " + r"(\d+): (.+) . Page 1 ", title) + return { + "manga": match.group(1), + "manga_id": text.parse_int(manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), + "type": match.group(2), + "author": match.group(3), + "title": match.group(5), + "lang": "en", + "language": "English", + } + + @staticmethod + def images(page): + images = text.extract(page, "'images' : ", ",\n")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] + + +class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor): + """Extractor for hmanga from hentai2read.com""" + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$" test = ( - ("http://hentai2read.com/amazon_elixir/", { + ("https://hentai2read.com/amazon_elixir/", { "url": "273073752d418ec887d7f7211e42b832e8c403ba", "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac", }), - ("http://hentai2read.com/oshikage_riot/", { + ("https://hentai2read.com/oshikage_riot/", { "url": "6595f920a3088a15c2819c502862d45f8eb6bea6", "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36", }), @@ -54,46 +98,3 @@ class Hentai2readMangaExtractor(MangaExtractor): "chapter": text.parse_int(chapter), "title": title, "lang": "en", "language": "English", })) - - -class Hentai2readChapterExtractor(ChapterExtractor): - """Extractor for a single manga chapter from hentai2read.com""" - category = "hentai2read" - archive_fmt = "{chapter_id}_{page}" - pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com/([^/]+)/(\d+)" - test = ("http://hentai2read.com/amazon_elixir/1/", { - "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", - "keyword": "9845105898d28c6a540cffdea60a1a20fab52431", - }) - - def __init__(self, match): - url_title, self.chapter = match.groups() - url = "https://hentai2read.com/{}/{}/".format(url_title, self.chapter) - ChapterExtractor.__init__(self, match, url) - - def get_metadata(self, page): - title, pos = text.extract(page, "", "") - manga_id, pos = text.extract(page, 'data-mid="', '"', pos) - chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) - match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - " - r"(\d+): (.+) . Page 1 ", title) - return { - "manga": match.group(1), - "manga_id": text.parse_int(manga_id), - "chapter": text.parse_int(self.chapter), - "chapter_id": text.parse_int(chapter_id), - "type": match.group(2), - "author": match.group(3), - "title": match.group(5), - "lang": "en", - "language": "English", - } - - @staticmethod - def get_images(page): - """Extract and return a list of all image-urls""" - images = text.extract(page, "'images' : ", ",\n")[0] - return [ - ("https://hentaicdn.com/hentai" + part, None) - for part in json.loads(images) - ] diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py index 93b19b32..e290f609 100644 --- a/gallery_dl/extractor/hentaicafe.py +++ b/gallery_dl/extractor/hentaicafe.py @@ -25,10 +25,10 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): }) root = "https://hentai.cafe" - def get_metadata(self, page): + def metadata(self, page): info = text.unescape(text.extract(page, '', '')[0]) manga, _, chapter_string = info.partition(" :: ") - return self.parse_chapter_url(self.url, { + return self.parse_chapter_url(self.chapter_url, { "manga": manga, "chapter_string": chapter_string.rstrip(" :"), }) @@ -58,7 +58,7 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): reverse = False def chapters(self, page): - if "/manga/series/" in self.url: + if "/manga/series/" in self.manga_url: chapters = foolslide.FoolslideMangaExtractor.chapters(self, page) chapters.reverse() return chapters diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index e5cf5650..15ca7e14 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -19,7 +19,7 @@ class HentaifoxGalleryExtractor(ChapterExtractor): filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" directory_fmt = ("{category}", "{gallery_id} {title}") archive_fmt = "{gallery_id}_{page}" - pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com/gallery/(\d+)" + pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" test = ("https://hentaifox.com/gallery/56622/", { "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", "count": 24, @@ -28,11 +28,10 @@ class HentaifoxGalleryExtractor(ChapterExtractor): root = "https://hentaifox.com" def __init__(self, match): - self.gallery_id = match.group(1) - url = "{}/gallery/{}".format(self.root, self.gallery_id) - ChapterExtractor.__init__(self, match, url) + ChapterExtractor.__init__(self, match) + self.gallery_id = match.group(2) - def get_metadata(self, page): + def metadata(self, page): title, pos = text.extract(page, "

", "

") data = text.extract_all(page, ( ("parodies" , ">Parodies:" , ""), @@ -51,7 +50,7 @@ class HentaifoxGalleryExtractor(ChapterExtractor): data["lang"] = "en" return data - def get_images(self, page): + def images(self, page): return [ (text.urljoin(self.root, url.replace("t.", ".")), None) for url in text.extract_iter(page, 'data-src="', '"') diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index 0a63baef..1e504042 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -14,11 +14,55 @@ import json import re -class HentaihereMangaExtractor(MangaExtractor): - """Extractor for hmanga from hentaihere.com""" +class HentaihereBase(): + """Base class for hentaihere extractors""" category = "hentaihere" - pattern = r"(?:https?://)?(?:www\.)?(hentaihere\.com/m/S\d+)/?$" - scheme = "https" + root = "https://hentaihere.com" + + +class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): + """Extractor for a single manga chapter from hentaihere.com""" + archive_fmt = "{chapter_id}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)" + test = ("https://hentaihere.com/m/S13812/1/1/", { + "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", + "keyword": "e9382a9be337abce3db2b1132e85751379dc05c5", + }) + + def __init__(self, match): + self.manga_id, self.chapter = match.groups() + url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + title = text.extract(page, "", "")[0] + chapter_id = text.extract(page, 'report/C', '"')[0] + pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " + match = re.match(pattern, title) + return { + "manga": match.group(1), + "manga_id": text.parse_int(self.manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), + "type": match.group(2), + "title": match.group(3), + "author": match.group(4), + "lang": "en", + "language": "English", + } + + @staticmethod + def images(page): + images = text.extract(page, "var rff_imageList = ", ";")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] + + +class HentaihereMangaExtractor(HentaihereBase, MangaExtractor): + """Extractor for hmanga from hentaihere.com""" + pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com(/m/S\d+)/?$" test = ( ("https://hentaihere.com/m/S13812", { "url": "d1ba6e28bb2162e844f8559c2b2725ba0a093559", @@ -33,7 +77,7 @@ class HentaihereMangaExtractor(MangaExtractor): def chapters(self, page): results = [] manga_id = text.parse_int( - self.url.rstrip("/").rpartition("/")[2][1:]) + self.manga_url.rstrip("/").rpartition("/")[2][1:]) manga, pos = text.extract( page, '', '') mtype, pos = text.extract( @@ -54,45 +98,3 @@ class HentaihereMangaExtractor(MangaExtractor): "chapter": text.parse_int(chapter), "title": title, "lang": "en", "language": "English", })) - - -class HentaihereChapterExtractor(ChapterExtractor): - """Extractor for a single manga chapter from hentaihere.com""" - category = "hentaihere" - archive_fmt = "{chapter_id}_{page}" - pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)" - test = ("https://hentaihere.com/m/S13812/1/1/", { - "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", - "keyword": "e9382a9be337abce3db2b1132e85751379dc05c5", - }) - - def __init__(self, match): - self.manga_id, self.chapter = match.groups() - url = "https://hentaihere.com/m/S{}/{}/1".format( - self.manga_id, self.chapter) - ChapterExtractor.__init__(self, match, url) - - def get_metadata(self, page): - title = text.extract(page, "", "")[0] - chapter_id = text.extract(page, 'report/C', '"')[0] - pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " - match = re.match(pattern, title) - return { - "manga": match.group(1), - "manga_id": text.parse_int(self.manga_id), - "chapter": text.parse_int(self.chapter), - "chapter_id": text.parse_int(chapter_id), - "type": match.group(2), - "title": match.group(3), - "author": match.group(4), - "lang": "en", - "language": "English", - } - - @staticmethod - def get_images(page): - images = text.extract(page, "var rff_imageList = ", ";")[0] - return [ - ("https://hentaicdn.com/hentai" + part, None) - for part in json.loads(images) - ] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 505c0d6e..a581daa0 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -38,8 +38,9 @@ class HitomiGalleryExtractor(ChapterExtractor): url = "https://hitomi.la/galleries/{}.html".format(self.gid) ChapterExtractor.__init__(self, match, url) - def get_metadata(self, page, extr=text.extract): + def metadata(self, page): pos = page.index('

', '<', pos) artist, pos = extr(page, '

', '

', pos) group , pos = extr(page, 'Group', '', pos) @@ -65,7 +66,7 @@ class HitomiGalleryExtractor(ChapterExtractor): "tags": self._prepare(tags), } - def get_images(self, page): + def images(self, page): # see https://ltn.hitomi.la/common.js frontends = 2 offset = self.gid % frontends if self.gid % 10 != 1 else 0 diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 27656fd1..fa638ac1 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -67,41 +67,6 @@ class KissmangaBase(): return data -class KissmangaMangaExtractor(KissmangaBase, MangaExtractor): - """Extractor for manga from kissmanga.com""" - pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" - r"(/Manga/[^/?&#]+/?)$") - test = ( - ("https://kissmanga.com/Manga/Dropout", { - "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532", - "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d", - }), - ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase - ) - - def __init__(self, match): - MangaExtractor.__init__(self, match, self.root + match.group(1)) - - def chapters(self, page): - results = [] - manga, pos = text.extract(page, ' class="barTitle">', '\ninformation') - page , pos = text.extract(page, ' class="listing">', '', pos) - manga = manga.strip() - needle = '" title="Read ' + manga + ' ' - manga = text.unescape(manga) - - for item in text.extract_iter(page, '
'): - url, _, chapter = item.partition(needle) - data = { - "manga": manga, "chapter_string": chapter, - "chapter_id": text.parse_int(url.rpartition("=")[2]), - "lang": "en", "language": "English", - } - self.parse_chapter_string(data) - results.append((self.root + url, data)) - return results - - class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): """Extractor for manga-chapters from kissmanga.com""" pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" @@ -127,11 +92,11 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): ) def __init__(self, match): - ChapterExtractor.__init__(self, match, self.root + match.group(1)) + ChapterExtractor.__init__(self, match) self.chapter_id = match.group(2) self.session.headers["Referer"] = self.root - def get_metadata(self, page): + def metadata(self, page): title = text.extract(page, "", "")[0].strip() manga, cinfo = title.split("\n")[1:3] data = { @@ -143,7 +108,7 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): } return self.parse_chapter_string(data) - def get_images(self, page): + def images(self, page): self.session.headers["Referer"] = None try: key = self.build_aes_key(page) @@ -211,3 +176,35 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): pos = script.index(var) lst = text.extract(script, "=", ";", pos)[0] return ast.literal_eval(lst.strip())[int(idx)] + + +class KissmangaMangaExtractor(KissmangaBase, MangaExtractor): + """Extractor for manga from kissmanga.com""" + pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" + r"(/Manga/[^/?&#]+/?)$") + test = ( + ("https://kissmanga.com/Manga/Dropout", { + "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532", + "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d", + }), + ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase + ) + + def chapters(self, page): + results = [] + manga, pos = text.extract(page, ' class="barTitle">', '\ninformation') + page , pos = text.extract(page, ' class="listing">', '', pos) + manga = manga.strip() + needle = '" title="Read ' + manga + ' ' + manga = text.unescape(manga) + + for item in text.extract_iter(page, ''): + url, _, chapter = item.partition(needle) + data = { + "manga": manga, "chapter_string": chapter, + "chapter_id": text.parse_int(url.rpartition("=")[2]), + "lang": "en", "language": "English", + } + self.parse_chapter_string(data) + results.append((self.root + url, data)) + return results diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 30510278..fc6d0495 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -16,7 +16,6 @@ import re class KomikcastBase(): """Base class for komikcast extractors""" category = "komikcast" - scheme = "https" root = "https://komikcast.com" request = cloudflare.request_func @@ -62,15 +61,12 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): }), ) - def __init__(self, match): - ChapterExtractor.__init__(self, match, self.root + match.group(1)) - - def get_metadata(self, page): + def metadata(self, page): info = text.extract(page, '', "")[0] return self.parse_chapter_string(info) @staticmethod - def get_images(page): + def images(page): readerarea = text.extract( page, '
', '

"), + ("title" , "", "<"), + ), values={"lang": "en", "language": "English"})[0] + + if not data["path"]: + raise exception.NotFoundError("chapter") + self.parse_chapter_path(data["path"], data) + + data["manga"], _, data["type"] = data["manga"].rpartition(" ") + data["manga"] = text.unescape(data["manga"]) + data["title"] = data["title"].partition(": ")[2] + for key in ("manga_id", "chapter_id", "stream"): + data[key] = text.parse_int(data[key]) + + return data + + def images(self, page): + data = json.loads(text.extract( + page, "var _load_pages =", ";")[0] or "[]") + return [ + (text.urljoin(self.root, item["u"]), { + "width": text.parse_int(item["w"]), + "height": text.parse_int(item["h"]), + }) + for item in data + ] + + class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): """Extractor for manga from mangapark.me""" pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" @@ -75,65 +137,3 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): results.append((self.root + path, data.copy())) return results - - -class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): - """Extractor for manga-chapters from mangapark.me""" - pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" - r"/manga/([^?&#]+/i\d+)") - test = ( - ("https://mangapark.me/manga/gosu/i811615/c55/1", { - "count": 50, - "keyword": "a18e07119b3317d7e795ef37ee69ce0bbb806350", - }), - (("https://mangapark.me/manga" - "/ad-astra-per-aspera-hata-kenjirou/i662054/c001.2/1"), { - "count": 40, - "keyword": "3f286631279e2017ce87c1b8db05d7b3f15e2971", - }), - ("https://mangapark.me/manga/gekkan-shoujo-nozaki-kun/i655476/c70/1", { - "count": 15, - "keyword": "3abb13e6d1ea7f8808b0ec415270b3afac97f98b", - }), - ("https://mangapark.net/manga/gosu/i811615/c55/1"), - ("https://mangapark.com/manga/gosu/i811615/c55/1"), - ) - - def __init__(self, match): - tld, self.path = match.groups() - self.root = self.root_fmt.format(tld) - url = "{}/manga/{}?zoom=2".format(self.root, self.path) - ChapterExtractor.__init__(self, match, url) - - def get_metadata(self, page): - data = text.extract_all(page, ( - ("manga_id" , "var _manga_id = '", "'"), - ("chapter_id", "var _book_id = '", "'"), - ("stream" , "var _stream = '", "'"), - ("path" , "var _book_link = '", "'"), - ("manga" , "

", "

"), - ("title" , "", "<"), - ), values={"lang": "en", "language": "English"})[0] - - if not data["path"]: - raise exception.NotFoundError("chapter") - self.parse_chapter_path(data["path"], data) - - data["manga"], _, data["type"] = data["manga"].rpartition(" ") - data["manga"] = text.unescape(data["manga"]) - data["title"] = data["title"].partition(": ")[2] - for key in ("manga_id", "chapter_id", "stream"): - data[key] = text.parse_int(data[key]) - - return data - - def get_images(self, page): - data = json.loads(text.extract( - page, "var _load_pages =", ";")[0] or "[]") - return [ - (text.urljoin(self.root, item["u"]), { - "width": text.parse_int(item["w"]), - "height": text.parse_int(item["h"]), - }) - for item in data - ] diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index ed922116..93f087bc 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -32,31 +32,6 @@ class MangareaderBase(): return data -class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): - """Extractor for manga from mangareader.net""" - pattern = r"(?:https?://)?((?:www\.)?mangareader\.net/[^/?&#]+)/?$" - reverse = False - test = ("https://www.mangareader.net/mushishi", { - "url": "bc203b858b4ad76e5d77e39118a7be0350e357da", - "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", - }) - - def chapters(self, page): - results = [] - data = self.parse_page(page, {"lang": "en", "language": "English"}) - - needle = '
\n') - while True: - url, pos = text.extract(page, needle, '"', pos) - if not url: - return results - data["title"], pos = text.extract(page, ' : ', '', pos) - data["date"] , pos = text.extract(page, '', '', pos) - data["chapter"] = text.parse_int(url.rpartition("/")[2]) - results.append((self.root + url, data.copy())) - - class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): """Extractor for manga-chapters from mangareader.net""" archive_fmt = "{manga}_{chapter}_{page}" @@ -68,11 +43,10 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): }) def __init__(self, match): - self.part, self.url_title, self.chapter = match.groups() - ChapterExtractor.__init__(self, match, self.root + self.part) + path, self.url_title, self.chapter = match.groups() + ChapterExtractor.__init__(self, match, self.root + path) - def get_metadata(self, chapter_page): - """Collect metadata for extractor-job""" + def metadata(self, chapter_page): page = self.request(self.root + self.url_title).text data = self.parse_page(page, { "chapter": text.parse_int(self.chapter), @@ -88,7 +62,7 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): ) return data - def get_images(self, page): + def images(self, page): while True: next_url, image_url, image_data = self.get_image_metadata(page) yield image_url, image_data @@ -117,3 +91,28 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): "width": text.parse_int(width), "height": text.parse_int(height), } + + +class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): + """Extractor for manga from mangareader.net""" + pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?&#]+)/?$" + reverse = False + test = ("https://www.mangareader.net/mushishi", { + "url": "bc203b858b4ad76e5d77e39118a7be0350e357da", + "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", + }) + + def chapters(self, page): + results = [] + data = self.parse_page(page, {"lang": "en", "language": "English"}) + + needle = '
\n') + while True: + url, pos = text.extract(page, needle, '"', pos) + if not url: + return results + data["title"], pos = text.extract(page, ' : ', '', pos) + data["date"] , pos = text.extract(page, '', '', pos) + data["chapter"] = text.parse_int(url.rpartition("/")[2]) + results.append((self.root + url, data.copy())) diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py index 57a878fc..7ff0239c 100644 --- a/gallery_dl/extractor/mangastream.py +++ b/gallery_dl/extractor/mangastream.py @@ -16,8 +16,8 @@ class MangastreamChapterExtractor(ChapterExtractor): """Extractor for manga-chapters from mangastream.com""" category = "mangastream" archive_fmt = "{chapter_id}_{page}" - pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)/" - r"r(?:ead)?/([^/]*/([^/]+)/(\d+))") + pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)" + r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))") test = ( ("https://readms.net/r/onepunch_man/087/4874/1"), ("https://mangastream.com/r/onepunch_man/087/4874/1"), @@ -29,7 +29,7 @@ class MangastreamChapterExtractor(ChapterExtractor): url = "{}/r/{}".format(self.root, self.part) ChapterExtractor.__init__(self, match, url) - def get_metadata(self, page): + def metadata(self, page): manga, pos = text.extract( page, '