1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-23 11:12:40 +01:00
gallery-dl/gallery_dl/extractor/__init__.py

279 lines
4.9 KiB
Python
Raw Normal View History

2015-06-28 22:53:52 +02:00
# -*- coding: utf-8 -*-
2014-10-12 21:56:44 +02:00
# Copyright 2015-2023 Mike Fährmann
2015-06-28 22:53:52 +02:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import sys
2015-06-28 22:53:52 +02:00
import re
# Names of the extractor submodules of this package.
# _modules_internal() imports them lazily, one at a time, in list order.
modules = [
    "2ch",
    "2chan",
    "2chen",
    "35photo",
    "3dbooru",
    "4chan",
    "4archive",
    "4chanarchives",
    "500px",
    "8chan",
    "8muses",
    "adultempire",
    "architizer",
    "artstation",
    "aryion",
    "batoto",
    "bbc",
    "behance",
    "blogger",
    "bluesky",
    "bunkr",
    "catbox",
    "chevereto",
    "comicvine",
    "cyberdrop",
    "danbooru",
    "desktopography",
    "deviantart",
    "dynastyscans",
    "e621",
    "erome",
    "exhentai",
    "fallenangels",
    "fanbox",
    "fanleaks",
    "fantia",
    "fapello",
    "fapachi",
    "flickr",
    "furaffinity",
    "fuskator",
    "gelbooru",
    "gelbooru_v01",
    "gelbooru_v02",
    "gofile",
    "hatenablog",
    "hentai2read",
    "hentaicosplays",
    "hentaifoundry",
    "hentaifox",
    "hentaihand",
    "hentaihere",
    "hiperdex",
    "hitomi",
    "hotleak",
    "idolcomplex",
    "imagebam",
    "imagechest",
    "imagefap",
    "imgbb",
    "imgbox",
    "imgth",
    "imgur",
    "inkbunny",
    "instagram",
    "issuu",
    "itaku",
    "itchio",
    "jschan",
    "kabeuchi",
    "keenspot",
    "kemonoparty",
    "khinsider",
    "komikcast",
    "lensdump",
    "lexica",
    "lightroom",
    "livedoor",
    "luscious",
    "lynxchan",
    "mangadex",
    "mangafox",
    "mangahere",
    "mangakakalot",
    "manganelo",
    "mangapark",
    "mangaread",
    "mangasee",
    "mangoxo",
    "misskey",
    "myhentaigallery",
    "myportfolio",
    "naver",
    "naverwebtoon",
    "newgrounds",
    "nhentai",
    "nijie",
    "nitter",
    "nozomi",
    "nsfwalbum",
    "paheal",
    "patreon",
    "philomena",
    "photobucket",
    "photovogue",
    "picarto",
    "piczel",
    "pillowfort",
    "pinterest",
    "pixeldrain",
    "pixiv",
    "pixnet",
    "plurk",
    "poipiku",
    "poringa",
    "pornhub",
    "pornpics",
    "postmill",
    "pururin",
    "reactor",
    "readcomiconline",
    "reddit",
    "redgifs",
    "rule34us",
    "sankaku",
    "sankakucomplex",
    "seiga",
    "senmanga",
    "sexcom",
    "shimmie2",
    "simplyhentai",
    "skeb",
    "slickpic",
    "slideshare",
    "smugmug",
    "soundgasm",
    "speakerdeck",
    "steamgriddb",
    "subscribestar",
    "szurubooru",
    "tapas",
    "tcbscans",
    "telegraph",
    "tmohentai",
    "toyhouse",
    "tsumino",
    "tumblr",
    "tumblrgallery",
    "twibooru",
    "twitter",
    "urlgalleries",
    "unsplash",
    "uploadir",
    "urlshortener",
    "vanillarock",
    "vichan",
    "vipergirls",
    "vk",
    "vsco",
    "wallhaven",
    "wallpapercave",
    "warosu",
    "weasyl",
    "webmshare",
    "webtoons",
    "weibo",
    "wikiart",
    "wikifeet",
    "wikimedia",
    "xhamster",
    "xvideos",
    "zerochan",
    "zzup",
    "booru",
    "moebooru",
    "foolfuuka",
    "foolslide",
    "mastodon",
    "shopify",
    "lolisafe",
    "imagehosts",
    "directlink",
    "recursive",
    "oauth",
    "test",
    "ytdl",
    "generic",
]
2017-02-01 00:53:19 +01:00
2015-10-05 17:52:50 +02:00
def find(url):
    """Return an extractor instance for 'url', or None if nothing matches

    The available extractor classes are tried in the order produced by
    _list_classes(); the first class whose compiled 'pattern' matches at
    the start of 'url' is instantiated with the resulting match object.
    """
    for extractor_cls in _list_classes():
        hit = extractor_cls.pattern.match(url)
        if not hit:
            continue
        return extractor_cls(hit)
    return None
2015-06-28 22:53:52 +02:00
2017-02-01 00:53:19 +01:00
def add(cls):
    """Register 'cls' as an available extractor and return it

    The 'pattern' attribute of the class is replaced by its compiled
    regular expression before the class is appended to the cache.
    """
    compiled = re.compile(cls.pattern)
    cls.pattern = compiled
    _cache.append(cls)
    return cls
def add_module(module):
    """Register all extractor classes found in 'module'

    Each class returned by _get_classes() gets its 'pattern' attribute
    compiled, then all of them are added to the cache in one step.
    Returns the list of classes that were registered.
    """
    found = _get_classes(module)
    for extractor_cls in found:
        extractor_cls.pattern = re.compile(extractor_cls.pattern)
    _cache.extend(found)
    return found
def extractors():
    """Return a new list of all available extractor classes, sorted by
    class name
    """
    classes = list(_list_classes())
    classes.sort(key=lambda extractor_cls: extractor_cls.__name__)
    return classes
2017-02-01 00:53:19 +01:00
2015-06-28 22:53:52 +02:00
# --------------------------------------------------------------------
# internals
2017-02-01 00:53:19 +01:00
def _list_classes():
    """Yield available extractor classes

    Already registered classes come first; after that the remaining
    modules are taken from the shared module iterator one by one and
    their classes are registered and yielded as they are found.
    """
    # classes that have been registered so far
    yield from _cache

    # continue with modules that have not been imported yet
    for module in _module_iter:
        for extractor_cls in add_module(module):
            yield extractor_cls

    # Only reached when the module iterator is exhausted: from now on the
    # cache is complete, so replace this generator function with a plain
    # accessor for it.
    namespace = globals()
    namespace["_list_classes"] = lambda: _cache
def _modules_internal():
    """Yield the modules named in 'modules', importing each on demand

    Every name is imported relative to this package (import level 1),
    and only when the generator is advanced to it.
    """
    namespace = globals()
    for name in modules:
        module = __import__(name, namespace, None, (), 1)
        yield module
def _modules_path(path, files):
    """Import the Python source files in 'files' from directory 'path'

    'path' is placed at the front of sys.path while the imports run.
    Every entry of 'files' that ends in ".py" is imported by its name
    without that suffix; all other entries are ignored.

    Returns the list of imported module objects, in the order of 'files'.
    An exception raised by an import propagates to the caller after
    'path' has been taken out of sys.path again.
    """
    sys.path.insert(0, path)
    try:
        return [
            __import__(name[:-3])
            for name in files
            if name.endswith(".py")
        ]
    finally:
        # Remove by value instead of 'del sys.path[0]': an imported module
        # may have modified sys.path itself, in which case index 0 is no
        # longer the entry inserted above.
        try:
            sys.path.remove(path)
        except ValueError:
            # imported code already removed the entry
            pass
2017-02-01 00:53:19 +01:00
2015-11-20 19:54:07 +01:00
def _get_classes(module):
    """Return a list of the extractor classes defined in 'module'

    An object counts as an extractor class when it has a 'pattern'
    attribute and its '__module__' equals the name of 'module', which
    leaves out objects that 'module' merely imported from elsewhere.
    """
    classes = []
    for obj in module.__dict__.values():
        if not hasattr(obj, "pattern"):
            continue
        if obj.__module__ != module.__name__:
            continue
        classes.append(obj)
    return classes
# Extractor classes registered so far; add() and add_module() append to it.
_cache = []
# Single shared generator over the modules named in 'modules'.
# _list_classes() advances it, so every module is requested at most once.
_module_iter = _modules_internal()