diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 9c8b410a..4bb124d7 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -86,7 +86,8 @@ Reddit https://www.reddit.com/ individual Images, Subm rule #34 https://rule34.paheal.net/ Posts, Tag-Searches Rule 34 https://rule34.xxx/ Pools, Posts, Tag-Searches Safebooru https://safebooru.org/ Pools, Posts, Tag-Searches -Sankaku Channel https://chan.sankakucomplex.com/ Articles, Pools, Posts, Tag-Searches Optional +Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional +Sankaku Complex https://www.sankakucomplex.com/ Articles Sen Manga https://raw.senmanga.com/ Chapters Sense-Scans http://sensescans.com/reader/ Chapters, Manga Sex.com https://www.sex.com/ Boards, Pins, Search Results diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 0cbf6252..3ee6d3db 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -78,6 +78,7 @@ modules = [ "rule34", "safebooru", "sankaku", + "sankakucomplex", "seiga", "senmanga", "sexcom", diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index a1e7219d..f67911b3 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://chan.sankakucomplex.com/""" +"""Extractors for https://chan.sankakucomplex.com/""" from .common import Extractor, Message, SharedConfigMixin from .. import text, util, exception @@ -297,72 +297,3 @@ class SankakuPostExtractor(SankakuExtractor): def get_posts(self): return (self.post_id,) - - -class SankakuArticleExtractor(Extractor): - """Extractor for articles on www.sankakucomplex.com""" - category = "sankaku" - subcategory = "article" - directory_fmt = ("{category}", "Articles", "{date:%Y-%m-%d} {title}") - filename_fmt = "{filename}.{extension}" - archive_fmt = "a_{date:%Y%m%d}_{filename}" - pattern = (r"(?:https?://)?www\.sankakucomplex\.com" - r"/(\d{4}/\d\d/\d\d)/([^/?&#]+)") - test = ( - ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", { - "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d", - "keyword": "4ab96f31df9ee95d0dc6eefc2ca4e508c45c8e00", - }), - ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", { - "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c", - "keyword": "a7876de642bf3e68fb4743dcd4d4e8778f2c17ab", - }), - ) - root = "https://www.sankakucomplex.com" - - def __init__(self, match): - Extractor.__init__(self, match) - self.date, self.title = match.groups() - - def items(self): - url = "{}/{}/{}/?pg=X".format(self.root, self.date, self.title) - extr = text.extract_from(self.request(url).text) - data = { - "title" : text.unescape( - extr('"og:title" content="', '"')), - "description": text.unescape( - extr('"og:description" content="', '"')), - "date" : text.parse_datetime( - extr('"og:updated_time" content="', '"')), - } - imgs = self.images(extr) - data["count"] = len(imgs) - data["tags"] = text.split_html(extr('="meta-tags">', ''))[::2] - - yield Message.Directory, data - for img in imgs: - img.update(data) - yield Message.Url, img["url"], img - - def images(self, extr): - num = 0 - imgs = [] - urls = set() - orig = re.compile(r"-\d+x\d+\.") - - extr('
', '') - while True: - url = extr('data-lazy-src="', '"') - if not url: - return imgs - if url in urls: - continue - if url[0] == "/": - url = text.urljoin(self.root, url) - url = orig.sub(".", url) - num += 1 - imgs.append(text.nameext_from_url(url, { - "url" : url, - "num" : num, - })) - urls.add(url) diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py new file mode 100644 index 00000000..3d7c1dfb --- /dev/null +++ b/gallery_dl/extractor/sankakucomplex.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.sankakucomplex.com/""" + +from .common import Extractor, Message +from .. import text +import re + + +class SankakucomplexExtractor(Extractor): + """Base class for sankakucomplex extractors""" + category = "sankakucomplex" + root = "https://www.sankakucomplex.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + +class SankakucomplexArticleExtractor(SankakucomplexExtractor): + """Extractor for articles on www.sankakucomplex.com""" + subcategory = "article" + directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{date:%Y%m%d}_{filename}" + pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + r"/(\d{4}/\d\d/\d\d/[^/?&#]+)") + test = ( + ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", { + "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d", + "keyword": "35cd2a0aba712d6b0e27a9fa2a5e823199d10ca0", + }), + ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", { + "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c", + "keyword": "8bf60e62fb5e9f2caabb29c16ed58d7e0dcf247f", + }), + ) + + def items(self): + url = "{}/{}/?pg=X".format(self.root, self.path) + extr = text.extract_from(self.request(url).text) + data = { + "title" : text.unescape( + extr('"og:title" content="', '"')), + "description": text.unescape( + extr('"og:description" content="', '"')), + "date" : text.parse_datetime( + extr('"og:updated_time" content="', '"')), + } + imgs = self.images(extr) + data["count"] = len(imgs) + data["tags"] = text.split_html(extr('="meta-tags">', '
'))[::2] + + yield Message.Directory, data + for img in imgs: + img.update(data) + yield Message.Url, img["url"], img + + def images(self, extr): + num = 0 + imgs = [] + urls = set() + orig = re.compile(r"-\d+x\d+\.") + + extr('
', '') + while True: + url = extr('data-lazy-src="', '"') + if not url: + return imgs + if url in urls: + continue + if url[0] == "/": + url = text.urljoin(self.root, url) + url = orig.sub(".", url) + num += 1 + imgs.append(text.nameext_from_url(url, { + "url" : url, + "num" : num, + })) + urls.add(url) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 3d103a08..37289996 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -57,6 +57,7 @@ CATEGORY_MAP = { "rbt" : "RebeccaBlackTech", "rule34" : "Rule 34", "sankaku" : "Sankaku Channel", + "sankakucomplex" : "Sankaku Complex", "seaotterscans" : "Sea Otter Scans", "seiga" : "Niconico Seiga", "senmanga" : "Sen Manga",