mirror of https://github.com/mikf/gallery-dl.git

commit b0cb4a1b9c
parent eb33e6cf2d

replace 'text.extract()' with 'text.extr()' where possible
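For context: `text.extract()` returns a `(substring, position)` tuple, so nearly every call site had to index the result with `[0]`, while the newer `text.extr()` returns the substring directly (and an empty string instead of `None` on a miss). A minimal sketch of the two helpers, simplified from `gallery_dl.text` (the real implementations also catch `AttributeError` and keep a few more details):

```python
def extract(txt, begin, end, pos=0):
    """Return (substring between 'begin' and 'end', end position); (None, pos) on failure."""
    try:
        first = txt.index(begin, pos) + len(begin)
        last = txt.index(end, first)
        return txt[first:last], last + len(end)
    except (ValueError, TypeError):
        return None, pos


def extr(txt, begin, end, default=""):
    """Stripped-down extract(): return only the substring, or 'default' on failure."""
    try:
        first = txt.index(begin) + len(begin)
        return txt[first:txt.index(end, first)]
    except (ValueError, TypeError):
        return default


# The mechanical rewrite applied throughout this commit:
page = "<html><head><title>Example</title></head></html>"
title = extract(page, "<title>", "</title>")[0]  # old: tuple, then [0]
title = extr(page, "<title>", "</title>")        # new: string directly
```

Note the changed failure value: `extract(...)[0]` yields `None` on a miss, `extr(...)` yields `""`. That is why the commit message says "where possible" — call sites that need the returned position (or a `None` result) are left on `text.extract()`.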
@@ -60,8 +60,8 @@ class _2chanThreadExtractor(Extractor):

     def metadata(self, page):
         """Collect metadata for extractor-job"""
-        title = text.extract(page, "<title>", "</title>")[0]
-        title, _, boardname = title.rpartition(" - ")
+        title, _, boardname = text.extr(
+            page, "<title>", "</title>").rpartition(" - ")
         return {
             "server": self.server,
             "title": title,
@@ -72,8 +72,8 @@ class _2chanThreadExtractor(Extractor):

     def posts(self, page):
         """Build a list of all post-objects"""
-        page = text.extract(
-            page, '<div class="thre"', '<div style="clear:left"></div>')[0]
+        page = text.extr(
+            page, '<div class="thre"', '<div style="clear:left"></div>')
         return [
             self.parse(post)
             for post in page.split('<table border=0>')
@@ -84,7 +84,7 @@ class _2chanThreadExtractor(Extractor):
         data = self._extract_post(post)
         if data["name"]:
             data["name"] = data["name"].strip()
-        path = text.extract(post, '<a href="/', '"')[0]
+        path = text.extr(post, '<a href="/', '"')
         if path and not path.startswith("bin/jump"):
             self._extract_image(post, data)
         data["tim"], _, data["extension"] = data["filename"].partition(".")
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -124,7 +124,7 @@ class _35photoUserExtractor(_35photoExtractor):
     def metadata(self):
         url = "{}/{}/".format(self.root, self.user)
         page = self.request(url).text
-        self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0])
+        self.user_id = text.parse_int(text.extr(page, "/user_", ".xml"))
         return {
             "user": self.user,
             "user_id": self.user_id,
@@ -189,10 +189,10 @@ class _35photoGenreExtractor(_35photoExtractor):
     def metadata(self):
         url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
         page = self.request(url).text
-        self.photo_ids = self._photo_ids(text.extract(
-            page, ' class="photo', '\n')[0])
+        self.photo_ids = self._photo_ids(text.extr(
+            page, ' class="photo', '\n'))
         return {
-            "genre": text.extract(page, " genre - ", ". ")[0],
+            "genre": text.extr(page, " genre - ", ". "),
             "genre_id": text.parse_int(self.genre_id),
         }

@@ -76,9 +76,9 @@ class _8musesAlbumExtractor(Extractor):
         url = self.root + self.path + self.params

         while True:
-            data = self._unobfuscate(text.extract(
+            data = self._unobfuscate(text.extr(
                 self.request(url).text,
-                'id="ractive-public" type="text/plain">', '</script>')[0])
+                'id="ractive-public" type="text/plain">', '</script>'))

             images = data.get("pictures")
             if images:
@@ -41,8 +41,8 @@ class ArtstationExtractor(Extractor):

             if adict["has_embedded_player"] and self.external:
                 player = adict["player_embedded"]
-                url = text.extract(player, 'src="', '"')[0] or \
-                    text.extract(player, "src='", "'")[0]
+                url = (text.extr(player, 'src="', '"') or
+                       text.extr(player, "src='", "'"))
                 if url and not url.startswith(self.root):
                     asset["extension"] = None
                     yield Message.Url, "ytdl:" + url, asset
@@ -128,8 +128,7 @@ class AryionExtractor(Extractor):

         # get filename from 'Content-Disposition' header
         cdis = headers["content-disposition"]
-        fname, _, ext = text.extract(
-            cdis, 'filename="', '"')[0].rpartition(".")
+        fname, _, ext = text.extr(cdis, 'filename="', '"').rpartition(".")
         if not fname:
             fname, ext = ext, fname

@@ -38,8 +38,8 @@ class BbcGalleryExtractor(GalleryExtractor):
     )

     def metadata(self, page):
-        data = json.loads(text.extract(
-            page, '<script type="application/ld+json">', '</script>')[0])
+        data = json.loads(text.extr(
+            page, '<script type="application/ld+json">', '</script>'))
         return {
             "programme": self.gallery_url.split("/")[4],
             "path": list(util.unique_sequence(
@@ -97,7 +97,7 @@ class BcyExtractor(Extractor):
         url = "{}/item/detail/{}".format(self.root, post_id)
         page = self.request(url, notfound="post").text
         return json.loads(
-            text.extract(page, 'JSON.parse("', '");')[0]
+            text.extr(page, 'JSON.parse("', '");')
             .replace('\\\\u002F', '/')
             .replace('\\"', '"')
         )["detail"]
@@ -119,8 +119,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
         }
         page = self.request(url, cookies=cookies).text

-        data = json.loads(text.extract(
-            page, 'id="beconfig-store_state">', '</script>')[0])
+        data = json.loads(text.extr(
+            page, 'id="beconfig-store_state">', '</script>'))
         return self._update(data["project"]["project"])

     def get_images(self, data):
@@ -137,7 +137,7 @@ class BehanceGalleryExtractor(BehanceExtractor):

             elif mtype == "video":
                 page = self.request(module["src"]).text
-                url = text.extract(page, '<source src="', '"')[0]
+                url = text.extr(page, '<source src="', '"')
                 if text.ext_from_url(url) == "m3u8":
                     url = "ytdl:" + url
                 append((url, module))
@@ -150,8 +150,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
             elif mtype == "embed":
                 embed = module.get("original_embed") or module.get("embed")
                 if embed:
-                    url = "ytdl:" + text.extract(embed, 'src="', '"')[0]
-                    append((url, module))
+                    append(("ytdl:" + text.extr(embed, 'src="', '"'), module))

         return result

@@ -61,8 +61,8 @@ class BloggerExtractor(Extractor):
             page = self.request(post["url"]).text
             for url in findall_video(page):
                 page = self.request(url).text
-                video_config = json.loads(text.extract(
-                    page, 'var VIDEO_CONFIG =', '\n')[0])
+                video_config = json.loads(text.extr(
+                    page, 'var VIDEO_CONFIG =', '\n'))
                 files.append(max(
                     video_config["streams"],
                     key=lambda x: x["format_id"],
@@ -68,9 +68,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         url = self.root + "/a/" + self.album_id

         try:
-            data = json.loads(text.extract(
+            data = json.loads(text.extr(
                 self.request(url).text,
-                'id="__NEXT_DATA__" type="application/json">', '<')[0])
+                'id="__NEXT_DATA__" type="application/json">', '<'))
             album = data["props"]["pageProps"]["album"]
             files = album["files"]
         except Exception as exc:
@@ -603,22 +603,22 @@ class DeviantartStashExtractor(DeviantartExtractor):
         page = self._limited_request(url).text

         if stash_id[0] == "0":
-            uuid = text.extract(page, '//deviation/', '"')[0]
+            uuid = text.extr(page, '//deviation/', '"')
             if uuid:
                 deviation = self.api.deviation(uuid)
-                deviation["index"] = text.parse_int(text.extract(
-                    page, 'gmi-deviationid="', '"')[0])
+                deviation["index"] = text.parse_int(text.extr(
+                    page, 'gmi-deviationid="', '"'))
                 yield deviation
             return

         for item in text.extract_iter(
                 page, 'class="stash-thumb-container', '</div>'):
-            url = text.extract(item, '<a href="', '"')[0]
+            url = text.extr(item, '<a href="', '"')

             if url:
                 stash_id = url.rpartition("/")[2]
             else:
-                stash_id = text.extract(item, 'gmi-stashid="', '"')[0]
+                stash_id = text.extr(item, 'gmi-stashid="', '"')
                 stash_id = "2" + util.bencode(text.parse_int(
                     stash_id), "0123456789abcdefghijklmnopqrstuvwxyz")

@@ -1484,8 +1484,8 @@ class DeviantartEclipseAPI():
     def _fetch_csrf_token(self, page=None):
         if page is None:
             page = self.request(self.extractor.root + "/").text
-        self.csrf_token = token = text.extract(
-            page, "window.__CSRF_TOKEN__ = '", "'")[0]
+        self.csrf_token = token = text.extr(
+            page, "window.__CSRF_TOKEN__ = '", "'")
         return token

@@ -30,7 +30,7 @@ class DynastyscansBase():
         src = extr("class='btn-group'>", "</div>")
         url = extr(' src="', '"')

-        src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
+        src = text.extr(src, 'href="', '"') if "Source<" in src else ""

         return {
             "url"     : self.root + url,
@@ -75,7 +75,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
             "title"   : text.unescape(match.group(4) or ""),
             "author"  : text.remove_html(author),
             "group"   : (text.remove_html(group) or
-                         text.extract(group, ' alt="', '"')[0] or ""),
+                         text.extr(group, ' alt="', '"')),
             "date"    : text.parse_datetime(extr(
                 '"icon-calendar"></i> ', '<'), "%b %d, %Y"),
             "lang"    : "en",
@@ -83,7 +83,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
         }

     def images(self, page):
-        data = text.extract(page, "var pages = ", ";\n")[0]
+        data = text.extr(page, "var pages = ", ";\n")
         return [
             (self.root + img["image"], None)
             for img in json.loads(data)
@@ -55,8 +55,8 @@ class EromeExtractor(Extractor):
             yield Message.Directory, data
             groups = page.split('<div class="media-group"')
             for data["num"], group in enumerate(util.advance(groups, 1), 1):
-                url = (text.extract(group, '<source src="', '"')[0] or
-                       text.extract(group, 'data-src="', '"')[0])
+                url = (text.extr(group, '<source src="', '"') or
+                       text.extr(group, 'data-src="', '"'))
                 if url:
                     yield Message.Url, url, text.nameext_from_url(url, data)

@@ -185,7 +185,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):

         if self.gallery_token:
             gpage = self._gallery_page()
-            self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+            self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
             if not self.image_token:
                 self.log.error("Failed to extract initial image token")
                 self.log.debug("Page content:\n%s", gpage)
@@ -193,7 +193,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             ipage = self._image_page()
         else:
             ipage = self._image_page()
-            part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+            part = text.extr(ipage, 'hentai.org/g/', '"')
             if not part:
                 self.log.error("Failed to extract gallery token")
                 self.log.debug("Page content:\n%s", ipage)
@@ -271,8 +271,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         }

         if data["uploader"].startswith("<"):
-            data["uploader"] = text.unescape(text.extract(
-                data["uploader"], ">", "<")[0])
+            data["uploader"] = text.unescape(text.extr(
+                data["uploader"], ">", "<"))

         f = data["favorites"][0]
         if f == "N":
@@ -400,7 +400,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         }

         page = self.request(url, cookies=cookies).text
-        current = text.extract(page, "<strong>", "</strong>")[0]
+        current = text.extr(page, "<strong>", "</strong>")
         self.log.debug("Image Limits: %s/%s", current, self.limits)
         self._remaining = self.limits - text.parse_int(current)

@@ -57,7 +57,7 @@ class FallenangelsChapterExtractor(ChapterExtractor):
         return [
             (img["page_image"], None)
             for img in json.loads(
-                text.extract(page, "var pages = ", ";")[0]
+                text.extr(page, "var pages = ", ";")
             )
         ]

@@ -56,7 +56,7 @@ class FoolfuukaExtractor(BaseExtractor):
         """Resolve a remote media link"""
         needle = '<meta http-equiv="Refresh" content="0; url='
         page = self.request(media["remote_media_link"]).text
-        return text.extract(page, needle, '"')[0]
+        return text.extr(page, needle, '"')

     @staticmethod
     def _remote_direct(media):
@@ -114,7 +114,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
         })

     def images(self, page):
-        return json.loads(text.extract(page, "var pages = ", ";")[0])
+        return json.loads(text.extr(page, "var pages = ", ";"))


 class FoolslideMangaExtractor(FoolslideExtractor):
@@ -160,7 +160,7 @@ class FuraffinityExtractor(Extractor):
         while path:
             page = self.request(self.root + path).text
             yield from text.extract_iter(page, 'id="sid-', '"')
-            path = text.extract(page, 'right" href="', '"')[0]
+            path = text.extr(page, 'right" href="', '"')

     def _pagination_search(self, query):
         url = self.root + "/search/"
@@ -58,7 +58,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
             self.root + "/ajax/gal.aspx", params=params, headers=headers,
         ).json()

-        title = text.extract(page, "<title>", "</title>")[0].strip()
+        title = text.extr(page, "<title>", "</title>").strip()
         title, _, gallery_id = title.rpartition("#")

         return {
@@ -104,7 +104,7 @@ class FuskatorSearchExtractor(Extractor):
                 page, 'class="pic_pad"><a href="', '"'):
             yield Message.Queue, self.root + path, data

-        pages = text.extract(page, 'class="pages"><span>', '>>><')[0]
+        pages = text.extr(page, 'class="pages"><span>', '>>><')
         if not pages:
             return
         url = self.root + text.rextract(pages, 'href="', '"')[0]
@@ -69,7 +69,7 @@ class GelbooruBase():
             yield "https://img1.gelbooru.com" + path

     def _notes(self, post, page):
-        notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
+        notes_data = text.extr(page, '<section id="notes"', '</section>')
         if not notes_data:
             return

@@ -98,8 +98,8 @@ class GelbooruV02Extractor(booru.BooruExtractor):
             self.root, post["id"])).text

     def _tags(self, post, page):
-        tag_container = (text.extract(page, '<ul id="tag-', '</ul>')[0] or
-                         text.extract(page, '<ul class="tag-', '</ul>')[0])
+        tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or
+                         text.extr(page, '<ul class="tag-', '</ul>'))
         if not tag_container:
             return

@@ -112,7 +112,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
             post["tags_" + key] = " ".join(value)

     def _notes(self, post, page):
-        note_container = text.extract(page, 'id="note-container"', "<img ")[0]
+        note_container = text.extr(page, 'id="note-container"', "<img ")
         if not note_container:
             return

@@ -87,25 +87,25 @@ class GenericExtractor(Extractor):
         """Extract generic webpage metadata, return them in a dict."""
         data = {}
         data['pageurl'] = self.url
-        data['title'] = text.extract(page, '<title>', "</title>")[0] or ""
-        data['description'] = text.extract(
-            page, '<meta name="description" content="', '"')[0] or ""
-        data['keywords'] = text.extract(
-            page, '<meta name="keywords" content="', '"')[0] or ""
-        data['language'] = text.extract(
-            page, '<meta name="language" content="', '"')[0] or ""
-        data['name'] = text.extract(
-            page, '<meta itemprop="name" content="', '"')[0] or ""
-        data['copyright'] = text.extract(
-            page, '<meta name="copyright" content="', '"')[0] or ""
-        data['og_site'] = text.extract(
-            page, '<meta property="og:site" content="', '"')[0] or ""
-        data['og_site_name'] = text.extract(
-            page, '<meta property="og:site_name" content="', '"')[0] or ""
-        data['og_title'] = text.extract(
-            page, '<meta property="og:title" content="', '"')[0] or ""
-        data['og_description'] = text.extract(
-            page, '<meta property="og:description" content="', '"')[0] or ""
+        data['title'] = text.extr(page, '<title>', "</title>")
+        data['description'] = text.extr(
+            page, '<meta name="description" content="', '"')
+        data['keywords'] = text.extr(
+            page, '<meta name="keywords" content="', '"')
+        data['language'] = text.extr(
+            page, '<meta name="language" content="', '"')
+        data['name'] = text.extr(
+            page, '<meta itemprop="name" content="', '"')
+        data['copyright'] = text.extr(
+            page, '<meta name="copyright" content="', '"')
+        data['og_site'] = text.extr(
+            page, '<meta property="og:site" content="', '"')
+        data['og_site_name'] = text.extr(
+            page, '<meta property="og:site_name" content="', '"')
+        data['og_title'] = text.extr(
+            page, '<meta property="og:title" content="', '"')
+        data['og_description'] = text.extr(
+            page, '<meta property="og:description" content="', '"')

         data = {k: text.unescape(data[k]) for k in data if data[k] != ""}

@@ -60,7 +60,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
         self.session.headers["Referer"] = url

     def metadata(self, page):
-        title = text.extract(page, "<title>", "</title>")[0]
+        title = text.extr(page, "<title>", "</title>")
         return {
             "title": text.unescape(title.rpartition(" Story Viewer - ")[0]),
             "slug" : self.slug,
@@ -156,8 +156,8 @@ class HentaifoundryExtractor(Extractor):
             "filter_media"  : "A",
             "filter_order"  : "date_new",
             "filter_type"   : "0",
-            "YII_CSRF_TOKEN": text.unquote(text.extract(
-                csrf_token, "%22", "%22")[0]),
+            "YII_CSRF_TOKEN": text.unquote(text.extr(
+                csrf_token, "%22", "%22")),
         }
         self.request(url, method="POST", data=data)

@@ -57,8 +57,8 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
         ChapterExtractor.__init__(self, match, url)

     def metadata(self, page):
-        title = text.extract(page, "<title>", "</title>")[0]
-        chapter_id = text.extract(page, 'report/C', '"')[0]
+        title = text.extr(page, "<title>", "</title>")
+        chapter_id = text.extr(page, 'report/C', '"')
         chapter, sep, minor = self.chapter.partition(".")
         pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
         match = re.match(pattern, title)
@@ -77,7 +77,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):

     @staticmethod
     def images(page):
-        images = text.extract(page, "var rff_imageList = ", ";")[0]
+        images = text.extr(page, "var rff_imageList = ", ";")
         return [
             ("https://hentaicdn.com/hentai" + part, None)
             for part in json.loads(images)
@@ -139,7 +139,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
         self.manga_data(self.manga, page)
         results = []

-        shortlink = text.extract(page, "rel='shortlink' href='", "'")[0]
+        shortlink = text.extr(page, "rel='shortlink' href='", "'")
         data = {
             "action" : "manga_get_reading_nav",
             "manga"  : shortlink.rpartition("=")[2],
@@ -182,6 +182,6 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
     def chapters(self, page):
         results = []
         for info in text.extract_iter(page, 'id="manga-item-', '<img'):
-            url = text.extract(info, 'href="', '"')[0]
+            url = text.extr(info, 'href="', '"')
             results.append((url, {}))
         return results
@@ -44,7 +44,7 @@ class HotleakExtractor(Extractor):

         for item in text.extract_iter(
                 page, '<article class="movie-item', '</article>'):
-            yield text.extract(item, '<a href="', '"')[0]
+            yield text.extr(item, '<a href="', '"')

         params["page"] += 1

@@ -87,8 +87,8 @@ class HotleakPostExtractor(HotleakExtractor):
         url = "{}/{}/{}/{}".format(
             self.root, self.creator, self.type, self.id)
         page = self.request(url).text
-        page = text.extract(
-            page, '<div class="movie-image thumb">', '</article>')[0]
+        page = text.extr(
+            page, '<div class="movie-image thumb">', '</article>')
         data = {
             "id"     : text.parse_int(self.id),
             "creator": self.creator,
@@ -96,12 +96,12 @@ class HotleakPostExtractor(HotleakExtractor):
         }

         if self.type == "photo":
-            data["url"] = text.extract(page, 'data-src="', '"')[0]
+            data["url"] = text.extr(page, 'data-src="', '"')
             text.nameext_from_url(data["url"], data)

         elif self.type == "video":
-            data["url"] = "ytdl:" + text.extract(
-                text.unescape(page), '"src":"', '"')[0]
+            data["url"] = "ytdl:" + text.extr(
+                text.unescape(page), '"src":"', '"')
             text.nameext_from_url(data["url"], data)
             data["extension"] = "mp4"

@@ -115,7 +115,7 @@ class IdolcomplexExtractor(SankakuExtractor):

         if self.extags:
             tags = collections.defaultdict(list)
-            tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
+            tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>')
             pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
             for tag_type, tag_name in pattern.findall(tags_html or ""):
                 tags[tag_type].append(text.unquote(tag_name))
@@ -83,8 +83,8 @@ class ImagebamGalleryExtractor(ImagebamExtractor):

     @staticmethod
     def metadata(page):
-        return {"title": text.unescape(text.extract(
-            page, 'id="gallery-name">', '<')[0].strip())}
+        return {"title": text.unescape(text.extr(
+            page, 'id="gallery-name">', '<').strip())}

     def images(self, page):
         findall = re.compile(r'<a href="https://www\.imagebam\.com'
@@ -36,8 +36,8 @@ class ImagechestGalleryExtractor(GalleryExtractor):

         return {
             "gallery_id": self.gallery_id,
-            "title": text.unescape(text.extract(
-                page, 'property="og:title" content="', '"')[0].strip())
+            "title": text.unescape(text.extr(
+                page, 'property="og:title" content="', '"').strip())
         }

     def images(self, page):
@@ -202,7 +202,7 @@ class ImagefapUserExtractor(ImagefapExtractor):

         response = self.request(url)
         self.user = response.url.split("/")[-2]
-        folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0]
+        folders = text.extr(response.text, ' id="tgl_all" value="', '"')
         return folders.rstrip("|").split("|")

     def galleries(self, folder_id):
@@ -259,7 +259,7 @@ class ViprImageExtractor(ImagehostImageExtractor):
         })

     def get_info(self, page):
-        url = text.extract(page, '<img src="', '"')[0]
+        url = text.extr(page, '<img src="', '"')
         return url, url

@@ -71,7 +71,7 @@ class ImgbbExtractor(Extractor):

         url = self.root + "/login"
         page = self.request(url).text
-        token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]
+        token = text.extr(page, 'PF.obj.config.auth_token="', '"')

         headers = {"Referer": url}
         data = {
@@ -154,7 +154,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
         }

     def images(self, page):
-        url = text.extract(page, '"og:url" content="', '"')[0]
+        url = text.extr(page, '"og:url" content="', '"')
         album_id = url.rpartition("/")[2].partition("?")[0]

         return self._pagination(page, "https://ibb.co/json", {
@@ -185,7 +185,7 @@ class ImgbbUserExtractor(ImgbbExtractor):
         return {"user": self.user}

     def images(self, page):
-        user = text.extract(page, '.obj.resource={"id":"', '"')[0]
+        user = text.extr(page, '.obj.resource={"id":"', '"')
         return self._pagination(page, self.page_url + "json", {
             "from"   : "user",
             "userid" : user,
@@ -53,7 +53,7 @@ class ImgboxExtractor(Extractor):
     @staticmethod
     def get_image_url(page):
         """Extract download-url"""
-        return text.extract(page, 'property="og:image" content="', '"')[0]
+        return text.extr(page, 'property="og:image" content="', '"')


 class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
@@ -89,7 +89,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
             raise exception.NotFoundError("gallery")
         self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page)

-        title = text.extract(page, "<h1>", "</h1>")[0]
+        title = text.extr(page, "<h1>", "</h1>")
         title, _, count = title.rpartition(" - ")
         return {
             "gallery_key": self.gallery_key,
@@ -41,7 +41,7 @@ class ImgthGalleryExtractor(Extractor):
         """Yield all image urls for this gallery"""
         pnum = 0
         while True:
-            thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
+            thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>')
            for url in text.extract_iter(thumbs, '<img src="', '"'):
                yield "https://imgth.com/images" + url[24:]
            if '<li class="next">' not in page:
@@ -236,7 +236,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
             # get user_id from user profile
             url = "{}/{}".format(self.root, favsby)
             page = self.request(url).text
-            user_id = text.extract(page, "?user_id=", "'")[0]
+            user_id = text.extr(page, "?user_id=", "'")
             params["favs_user_id"] = user_id.partition("&")[0]

         return self.api.search(params)
@@ -54,8 +54,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
         })

     def metadata(self, page):
-        data = json.loads(text.extract(
-            page, '<script data-json="', '"')[0].replace("&quot;", '"'))
+        data = json.loads(text.extr(
+            page, '<script data-json="', '"').replace("&quot;", '"'))

         doc = data["initialDocumentData"]["document"]
         doc["date"] = text.parse_datetime(
@@ -62,7 +62,7 @@ class KabeuchiUserExtractor(Extractor):
         response = self.request(url)
         if response.history and response.url == self.root + "/":
             raise exception.NotFoundError("user")
-        target_id = text.extract(response.text, 'user_friend_id = "', '"')[0]
+        target_id = text.extr(response.text, 'user_friend_id = "', '"')
         return self._pagination(target_id)

     def _pagination(self, target_id):
@@ -96,7 +96,7 @@ class KeenspotComicExtractor(Extractor):
             self._image = '<div id="comic">'
             return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"

-        url = text.extract(page, '<link rel="first" href="', '"')[0]
+        url = text.extr(page, '<link rel="first" href="', '"')
         if url:
             if self.comic == "porcelain":
                 self._needle = 'id="porArchivetop_"'
@@ -144,7 +144,7 @@ class KeenspotComicExtractor(Extractor):

     @staticmethod
     def _next_link(page):
-        return text.extract(page, '<link rel="next" href="', '"')[0]
+        return text.extr(page, '<link rel="next" href="', '"')

     @staticmethod
     def _next_id(page):
@@ -192,7 +192,7 @@ class KemonopartyExtractor(Extractor):
                 "body": text.unescape(text.extract(
                     dm, "<pre>", "</pre></",
                 )[0].strip()),
-                "date": text.extract(dm, 'datetime="', '"')[0],
+                "date": text.extr(dm, 'datetime="', '"'),
             })
         return dms

@@ -76,7 +76,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
         else:
             fmt = fmt.lower().split(",")

-        page = text.extract(page, '<table id="songlist">', '</table>')[0]
+        page = text.extr(page, '<table id="songlist">', '</table>')
         for num, url in enumerate(text.extract_iter(
                 page, '<td class="clickable-row"><a href="', '"'), 1):
             url = text.urljoin(self.root, url)
@@ -35,8 +35,8 @@ class KissgoddessGalleryExtractor(GalleryExtractor):
     def metadata(self, page):
         return {
             "gallery_id": text.parse_int(self.gallery_id),
-            "title"     : text.extract(
-                page, '<title>', "<")[0].rpartition(" | ")[0],
+            "title"     : text.extr(
+                page, '<title>', "<").rpartition(" | ")[0],
         }

     def images(self, page):
@@ -62,13 +62,13 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
     )

     def metadata(self, page):
-        info = text.extract(page, "<title>", " – Komikcast<")[0]
+        info = text.extr(page, "<title>", " – Komikcast<")
         return self.parse_chapter_string(info)

     @staticmethod
     def images(page):
-        readerarea = text.extract(
-            page, '<div class="main-reading-area', '</div')[0]
+        readerarea = text.extr(
+            page, '<div class="main-reading-area', '</div')
         return [
             (text.unescape(url), None)
             for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
@@ -47,7 +47,7 @@ class LightroomGalleryExtractor(Extractor):
         url = "https://lightroom.adobe.com/shares/" + self.href
         response = self.request(url)
         album = json.loads(
-            text.extract(response.text, "albumAttributes: ", "\n")[0]
+            text.extr(response.text, "albumAttributes: ", "\n")
         )

         images = self.images(album)
@@ -22,8 +22,8 @@ class LineblogBase():
         body = post.pop("body")

         for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
-            src = text.extract(img, 'src="', '"')[0]
-            alt = text.extract(img, 'alt="', '"')[0]
+            src = text.extr(img, 'src="', '"')
+            alt = text.extr(img, 'alt="', '"')

             if not src:
                 continue
@@ -37,7 +37,7 @@ class LivedoorExtractor(Extractor):

     def _load(self, data, body):
         extr = text.extract_from(data)
-        tags = text.extract(body, 'class="article-tags">', '</dl>')[0]
+        tags = text.extr(body, 'class="article-tags">', '</dl>')
         about = extr('rdf:about="', '"')

         return {
@@ -57,8 +57,8 @@ class LivedoorExtractor(Extractor):
         body = post.pop("body")

         for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
-            src = text.extract(img, 'src="', '"')[0]
-            alt = text.extract(img, 'alt="', '"')[0]
+            src = text.extr(img, 'src="', '"')
+            alt = text.extr(img, 'alt="', '"')

             if not src:
                 continue
@@ -63,8 +63,8 @@ class ManganeloChapterExtractor(ChapterExtractor):
         }

     def images(self, page):
-        page = text.extract(
-            page, 'class="container-chapter-reader', '\n<div')[0]
+        page = text.extr(
+            page, 'class="container-chapter-reader', '\n<div')
         return [
             (url, None)
             for url in text.extract_iter(page, '<img src="', '"')
@@ -104,7 +104,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
         return data

     def images(self, page):
-        data = json.loads(text.extract(page, "var _load_pages =", ";")[0])
+        data = json.loads(text.extr(page, "var _load_pages =", ";"))
         return [
             (text.urljoin(self.root, item["u"]), {
                 "width": text.parse_int(item["w"]),
@@ -136,10 +136,10 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
         results = []
         data = {"lang": "en", "language": "English"}
         data["manga"] = text.unescape(
-            text.extract(page, '<title>', ' Manga - ')[0])
+            text.extr(page, '<title>', ' Manga - '))

         for stream in page.split('<div id="stream_')[1:]:
-            data["stream"] = text.parse_int(text.extract(stream, '', '"')[0])
+            data["stream"] = text.parse_int(text.extr(stream, '', '"'))

             for chapter in text.extract_iter(stream, '<li ', '</li>'):
                 path, pos = text.extract(chapter, 'href="', '"')
@@ -38,7 +38,7 @@ class MangoxoExtractor(Extractor):

         url = self.root + "/login"
         page = self.request(url).text
-        token = text.extract(page, 'id="loginToken" value="', '"')[0]
+        token = text.extr(page, 'id="loginToken" value="', '"')

         url = self.root + "/api/login"
         headers = {
@@ -115,7 +115,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):

         data["extension"] = None
         for data["num"], path in enumerate(imgs, 1):
-            data["id"] = text.parse_int(text.extract(path, "=", "&")[0])
+            data["id"] = text.parse_int(text.extr(path, "=", "&"))
             url = self.root + "/external/" + path.rpartition("url=")[2]
             yield Message.Url, url, text.nameext_from_url(url, data)

@@ -31,7 +31,7 @@ class MoebooruExtractor(BooruExtractor):
             self.root, post["id"])).text

     def _tags(self, post, page):
-        tag_container = text.extract(page, '<ul id="tag-', '</ul>')[0]
+        tag_container = text.extr(page, '<ul id="tag-', '</ul>')
         if not tag_container:
             return

@@ -43,7 +43,7 @@ class MoebooruExtractor(BooruExtractor):
             post["tags_" + key] = " ".join(value)

     def _notes(self, post, page):
-        note_container = text.extract(page, 'id="note-container"', "<img ")[0]
+        note_container = text.extr(page, 'id="note-container"', "<img ")
         if not note_container:
             return

@@ -59,7 +59,7 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):

     def images(self, page):
         return [
-            (text.unescape(text.extract(url, 'src="', '"')[0]).replace(
+            (text.unescape(text.extr(url, 'src="', '"')).replace(
                 "/thumbnail/", "/original/"), None)
             for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
         ]
@@ -57,8 +57,8 @@ class MyportfolioGalleryExtractor(Extractor):
             raise exception.NotFoundError()
         page = response.text

-        projects = text.extract(
-            page, '<section class="project-covers', '</section>')[0]
+        projects = text.extr(
+            page, '<section class="project-covers', '</section>')

         if projects:
             data = {"_extractor": MyportfolioGalleryExtractor}
@@ -44,10 +44,10 @@ class NanaGalleryExtractor(GalleryExtractor):

     def metadata(self, page):
         title = text.unescape(
-            text.extract(page, '</a> ', '</div>')[0])
-        artist = text.unescape(text.extract(
-            page, '<title>', '</title>')[0])[len(title):-10]
-        tags = text.extract(page, 'Reader.tags = "', '"')[0]
+            text.extr(page, '</a> ', '</div>'))
+        artist = text.unescape(text.extr(
+            page, '<title>', '</title>'))[len(title):-10]
+        tags = text.extr(page, 'Reader.tags = "', '"')

         return {
             "gallery_id": self.gallery_id,
@@ -59,7 +59,7 @@ class NanaGalleryExtractor(GalleryExtractor):
         }

     def images(self, page):
-        data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0])
+        data = json.loads(text.extr(page, "Reader.pages = ", ".pages"))
         return [
             ("https://nana.my.id" + image, None)
             for image in data["pages"]
@@ -108,8 +108,8 @@ class NanaSearchExtractor(Extractor):

             for gallery in text.extract_iter(
                     page, '<div class="id3">', '</div>'):
-                url = "https://nana.my.id" + text.extract(
-                    gallery, '<a href="', '"')[0]
+                url = "https://nana.my.id" + text.extr(
+                    gallery, '<a href="', '"')
                 yield Message.Queue, url, data

             self.params["p"] += 1
@@ -76,7 +76,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):

     @staticmethod
     def images(page):
-        view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0]
+        view_area = text.extr(page, 'id="comic_view_area"', '</div>')
         return [
             (url, None)
             for url in text.extract_iter(view_area, '<img src="', '"')
@@ -88,8 +88,8 @@ class NewgroundsExtractor(Extractor):
             return self.session.cookies

         headers = {"Origin": self.root, "Referer": url}
-        url = text.urljoin(self.root, text.extract(
-            response.text, 'action="', '"')[0])
+        url = text.urljoin(self.root, text.extr(
+            response.text, 'action="', '"'))
         data = {
             "username": username,
             "password": password,
@@ -140,7 +140,7 @@ class NewgroundsExtractor(Extractor):
         data["score"] = text.parse_float(extr('id="score_number">', '<'))
         data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>'))
         data["artist"] = [
-            text.extract(user, '//', '.')[0]
+            text.extr(user, '//', '.')
             for user in text.extract_iter(page, '<div class="item-user">', '>')
         ]

@@ -275,7 +275,7 @@ class NewgroundsExtractor(Extractor):

         for year, items in items.items():
             for item in items:
-                page_url = text.extract(item, 'href="', '"')[0]
+                page_url = text.extr(item, 'href="', '"')
                 if page_url[0] == "/":
                     page_url = self.root + page_url
                 yield page_url
@@ -107,7 +107,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
         """Extract image URLs from 'page'"""
         images = text.extract_iter(page, "/view_popup.php", "</a>")
         for num, image in enumerate(images):
-            src = text.extract(image, 'src="', '"')[0]
+            src = text.extr(image, 'src="', '"')
             if not src:
                 continue
             url = ("https:" + src).replace("/__rs_l120x120/", "/")
@@ -118,7 +118,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):

     @staticmethod
     def _extract_user_name(page):
-        return text.unescape(text.extract(page, "<br />", "<")[0] or "")
+        return text.unescape(text.extr(page, "<br />", "<"))

     def login(self):
         """Login and obtain session cookies"""
@@ -322,8 +322,7 @@ class NijieNuitaExtractor(NijieExtractor):

     @staticmethod
     def _extract_user_name(page):
-        return text.unescape(text.extract(
-            page, "<title>", "さんの抜いた")[0] or "")
+        return text.unescape(text.extr(page, "<title>", "さんの抜いた"))


 class NijieFeedExtractor(NijieExtractor):
@@ -95,7 +95,7 @@ class PatreonExtractor(Extractor):
         if content:
             for img in text.extract_iter(
                     content, '<img data-media-id="', '>'):
-                url = text.extract(img, 'src="', '"')[0]
+                url = text.extr(img, 'src="', '"')
                 if url:
                     yield "content", url, self._filename(url) or url

@@ -181,7 +181,7 @@ class PatreonExtractor(Extractor):
         """Fetch filename from an URL's Content-Disposition header"""
         response = self.request(url, method="HEAD", fatal=False)
         cd = response.headers.get("Content-Disposition")
-        return text.extract(cd, 'filename="', '"')[0]
+        return text.extr(cd, 'filename="', '"')

     @staticmethod
     def _filehash(url):
@@ -284,7 +284,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
         url = "{}/{}/posts".format(self.root, self.creator)

         page = self.request(url, notfound="creator").text
-        campaign_id = text.extract(page, "/campaign/", "/")[0]
+        campaign_id = text.extr(page, "/campaign/", "/")
         if not campaign_id:
             raise exception.NotFoundError("creator")

@@ -75,7 +75,7 @@ class PhotobucketAlbumExtractor(Extractor):
         page = self.request(url, params=params).text
         json_data = text.extract(page, "collectionData:", ",\n")[0]
         if not json_data:
-            msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0]
+            msg = text.extr(page, 'libraryPrivacyBlock">', "</div>")
             msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
             self.log.error("Unable to get JSON data%s", msg)
             return
@@ -98,7 +98,7 @@ class PillowfortExtractor(Extractor):

         url = "https://www.pillowfort.social/users/sign_in"
         page = self.request(url).text
-        auth = text.extract(page, 'name="authenticity_token" value="', '"')[0]
+        auth = text.extr(page, 'name="authenticity_token" value="', '"')

         headers = {"Origin": self.root, "Referer": url}
         data = {
@@ -638,7 +638,7 @@ class PixivPixivisionExtractor(PixivExtractor):
         headers = {"User-Agent": "Mozilla/5.0"}
         self.page = self.request(url, headers=headers).text

-        title = text.extract(self.page, '<title>', '<')[0]
+        title = text.extr(self.page, '<title>', '<')
         return {
             "pixivision_id"   : self.pixivision_id,
             "pixivision_title": text.unescape(title),
@@ -692,7 +692,7 @@ class PixivSeriesExtractor(PixivExtractor):
         series = body["extraData"]["meta"]
         series["id"] = self.series_id
         series["total"] = page["total"]
-        series["title"] = text.extract(series["title"], '"', '"')[0]
+        series["title"] = text.extr(series["title"], '"', '"')

         for info in page["series"]:
             work = self.api.illust_detail(info["workId"])
@@ -30,7 +30,7 @@ class PixnetExtractor(Extractor):
     def items(self):
         url = self.url_fmt.format(self.root, self.item_id)
         page = self.request(url, encoding="utf-8").text
-        user = text.extract(page, '<meta name="author" content="', '";')[0]
+        user = text.extr(page, '<meta name="author" content="', '";')
         data = {
             "blog": self.blog,
             "user": user.rpartition(" (")[0],
@@ -52,13 +52,13 @@ class PixnetExtractor(Extractor):
         while True:
             yield from text.extract_iter(page, '<li id="', '</li>')

-            pnext = text.extract(page, 'class="nextBtn"', '>')[0]
+            pnext = text.extr(page, 'class="nextBtn"', '>')
             if pnext is None and 'name="albumpass">' in page:
                 raise exception.StopExtraction(
                     "Album %s is password-protected.", self.item_id)
             if "href" not in pnext:
                 return
-            url = self.root + text.extract(pnext, 'href="', '"')[0]
+            url = self.root + text.extr(pnext, 'href="', '"')
             page = self.request(url, encoding="utf-8").text

@@ -73,8 +73,8 @@ class PururinGalleryExtractor(GalleryExtractor):

         url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
         page = self.request(url).text
-        info = json.loads(binascii.a2b_base64(text.extract(
-            page, '<gallery-read encoded="', '"')[0]).decode())
+        info = json.loads(binascii.a2b_base64(text.extr(
+            page, '<gallery-read encoded="', '"')).decode())
         self._ext = info["image_extension"]
         self._cnt = info["total_pages"]

@@ -109,13 +109,13 @@ class ReactorExtractor(BaseExtractor):
         tags.sort()

         for image in images:
-            url = text.extract(image, ' src="', '"')[0]
+            url = text.extr(image, ' src="', '"')
             if not url:
                 continue
             if url.startswith("//"):
                 url = "http:" + url
-            width = text.extract(image, ' width="', '"')[0]
-            height = text.extract(image, ' height="', '"')[0]
+            width = text.extr(image, ' width="', '"')
+            height = text.extr(image, ' height="', '"')
             image_id = url.rpartition("-")[2].partition(".")[0]
             num += 1

@@ -125,7 +125,7 @@ class ReactorExtractor(BaseExtractor):
                 url = url.replace("/post/", "/post/full/")

             if self.gif and ("/post/webm/" in url or "/post/mp4/" in url):
-                gif_url = text.extract(image, '<a href="', '"')[0]
+                gif_url = text.extr(image, '<a href="', '"')
                 if not gif_url:
                     continue
                 url = gif_url
@@ -306,7 +306,7 @@ class SankakuAPI():
             url = post["file_url"]
             if url:
                 expires = text.parse_int(
-                    text.extract(url, "e=", "&")[0]) - 60
+                    text.extr(url, "e=", "&")) - 60

                 if 0 < expires <= time():
                     self.extractor.log.debug("Refreshing download URLs")
@@ -43,7 +43,7 @@ class SexcomExtractor(Extractor):
             yield self.root + href

         pager = extr('id="pagenum"', '</div>')
-        url = text.extract(pager, ' href="', '"')[0]
+        url = text.extr(pager, ' href="', '"')
         if not url:
             return
         url = text.urljoin(self.root, url)
@@ -71,7 +71,7 @@ class SexcomExtractor(Extractor):
         info = extr("player.updateSrc(", ");")

         if info:
-            path = text.extract(info, "src: '", "'")[0]
+            path = text.extr(info, "src: '", "'")
             data["filename"] = path.rpartition("/")[2]
             data["extension"] = "mp4"
             if "'HD'" in info:
@@ -79,8 +79,8 @@ class SexcomExtractor(Extractor):
             data["url"] = self.root + path
         else:
             iframe = extr('<iframe', '>')
-            src = (text.extract(iframe, ' src="', '"')[0] or
-                   text.extract(iframe, " src='", "'")[0])
+            src = (text.extr(iframe, ' src="', '"') or
+                   text.extr(iframe, " src='", "'"))
            if not src:
                self.log.warning("Unable to fetch media from %s", url)
                return None
@@ -111,7 +111,7 @@ class SimplyhentaiImageExtractor(Extractor):
         url = extr('&quot;image&quot;:&quot;', '&')
         url = extr('&quot;content&quot;:&quot;', '&') or url

-        tags = text.extract(descr, " tagged with ", " online for free ")[0]
+        tags = text.extr(descr, " tagged with ", " online for free ")
         if tags:
             tags = tags.split(", ")
             tags[-1] = tags[-1].partition(" ")[2]
@@ -176,7 +176,7 @@ class SimplyhentaiVideoExtractor(Extractor):
         embed_url = text.extract(page, 'src="', '"', pos)[0].replace(
             "embedplayer.php?link=", "embed.php?name=")
         embed_page = self.request(embed_url).text
-        video_url = text.extract(embed_page, '"file":"', '"')[0]
+        video_url = text.extr(embed_page, '"file":"', '"')
         title, _, episode = title.rpartition(" Episode ")

         if video_url.startswith("//"):
@@ -89,23 +89,23 @@ class SubscribestarExtractor(Extractor):
     def _media_from_post(html):
         media = []

-        gallery = text.extract(html, 'data-gallery="', '"')[0]
+        gallery = text.extr(html, 'data-gallery="', '"')
         if gallery:
             media.extend(
                 item for item in json.loads(text.unescape(gallery))
                 if "/previews/" not in item["url"]
             )

-        attachments = text.extract(
-            html, 'class="uploads-docs"', 'data-role="post-edit_form"')[0]
+        attachments = text.extr(
+            html, 'class="uploads-docs"', 'data-role="post-edit_form"')
         if attachments:
             for att in attachments.split('class="doc_preview"')[1:]:
                 media.append({
-                    "id"  : text.parse_int(text.extract(
-                        att, 'data-upload-id="', '"')[0]),
-                    "name": text.unescape(text.extract(
-                        att, 'doc_preview-title">', '<')[0] or ""),
-                    "url" : text.unescape(text.extract(att, 'href="', '"')[0]),
+                    "id"  : text.parse_int(text.extr(
+                        att, 'data-upload-id="', '"')),
+                    "name": text.unescape(text.extr(
+                        att, 'doc_preview-title">', '<')),
+                    "url" : text.unescape(text.extr(att, 'href="', '"')),
                     "type": "attachment",
                 })

@@ -175,7 +175,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
                 return
             yield from posts

-            url = text.extract(posts[-1], needle_next_page, '"')[0]
+            url = text.extr(posts[-1], needle_next_page, '"')
             if not url:
                 return
             page = self.request(self.root + text.unescape(url)).json()["html"]
@@ -257,7 +257,7 @@ class TumblrExtractor(Extractor):
             except Exception:
                 return resized, True
             else:
-                updated = text.extract(response.text, '" src="', '"')[0]
+                updated = text.extr(response.text, '" src="', '"')
                 return updated, (resized == updated)

     def _original_image_fallback(self, url, post_id):
@@ -46,7 +46,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):

     def metadata(self, page):
         return {
-            "title"     : text.unescape(text.extract(page, "<h1>", "</h1>"))[0],
+            "title"     : text.unescape(text.extr(page, "<h1>", "</h1>")),
             "gallery_id": self.gallery_id,
         }

@@ -82,7 +82,7 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
     def metadata(self, page):
         return {
             "title"     : text.remove_html(
-                text.unescape(text.extract(page, "<title>", "</title>")[0])
+                text.unescape(text.extr(page, "<title>", "</title>"))
             ).replace("_", "-"),
             "gallery_id": self.gallery_id,
         }
@@ -127,12 +127,12 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
                 data = self._data_from_url(url)
                 data["gallery_id"] = gallery_id
                 data["title"] = text.remove_html(text.unescape(
-                    text.extract(post_page, "<title>", "</title>")[0]
+                    text.extr(post_page, "<title>", "</title>")
                 )).replace("_", "-")
                 yield url, data

-            next_url = text.extract(
-                page, '</span> <a class="btn btn-primary" href="', '"')[0]
+            next_url = text.extr(
+                page, '</span> <a class="btn btn-primary" href="', '"')
             if not next_url or page_url == next_url:
                 return
             page_url = next_url
@@ -227,8 +227,8 @@ class TwitterExtractor(Extractor):
             response = self.request(url, fatal=False)
             if response.status_code >= 400:
                 continue
-            url = text.extract(
-                response.text, 'name="twitter:image" value="', '"')[0]
+            url = text.extr(
+                response.text, 'name="twitter:image" value="', '"')
             if url:
                 files.append({"url": url})

@@ -44,7 +44,7 @@ class VanillarockPostExtractor(VanillarockExtractor):
             img = extr('<div class="main-img">', '</div>')
             if not img:
                 break
-            imgs.append(text.extract(img, 'href="', '"')[0])
+            imgs.append(text.extr(img, 'href="', '"'))

         data = {
             "count": len(imgs),
@@ -89,5 +89,5 @@ class VanillarockTagExtractor(VanillarockExtractor):
             post = extr('<h2 class="entry-title">', '</h2>')
             if not post:
                 break
-            yield Message.Queue, text.extract(post, 'href="', '"')[0], data
+            yield Message.Queue, text.extr(post, 'href="', '"'), data
         url = text.unescape(extr('class="next page-numbers" href="', '"'))
@@ -69,7 +69,7 @@ class VscoExtractor(Extractor):

     def _extract_preload_state(self, url):
         page = self.request(url, notfound=self.subcategory).text
-        return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0])
+        return json.loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))

     def _pagination(self, url, params, token, key, extra=None):
         headers = {
@@ -57,8 +57,8 @@ class WarosuThreadExtractor(Extractor):

     def get_metadata(self, page):
         """Collect metadata for extractor-job"""
-        boardname = text.extract(page, "<title>", "</title>")[0]
-        title = text.extract(page, 'filetitle" itemprop="name">', '<')[0]
+        boardname = text.extr(page, "<title>", "</title>")
+        title = text.extr(page, 'filetitle" itemprop="name">', '<')
         return {
             "board": self.board,
             "board_name": boardname.rpartition(" - ")[2],
@@ -68,7 +68,7 @@ class WarosuThreadExtractor(Extractor):

     def posts(self, page):
         """Build a list of all post-objects"""
-        page = text.extract(page, '<div class="content">', '<table>')[0]
+        page = text.extr(page, '<div class="content">', '<table>')
         needle = '<table itemscope itemtype="http://schema.org/Comment">'
         return [self.parse(post) for post in page.split(needle)]

@@ -225,7 +225,7 @@ class WeasylFavoriteExtractor(WeasylExtractor):
             pos = page.index('id="favorites-content"')

            if not owner_login:
-                owner_login = text.extract(page, '<a href="/~', '"')[0]
+                owner_login = text.extr(page, '<a href="/~', '"')

            for submitid in text.extract_iter(page, "/submissions/", "/", pos):
                if submitid == lastid:
@@ -169,7 +169,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
     @staticmethod
     def get_episode_urls(page):
         """Extract and return all episode urls in 'page'"""
-        page = text.extract(page, 'id="_listUl"', '</ul>')[0]
+        page = text.extr(page, 'id="_listUl"', '</ul>')
         return [
             match.group(0)
             for match in WebtoonsEpisodeExtractor.pattern.finditer(page)
@@ -173,7 +173,7 @@ class WeiboExtractor(Extractor):

         page = Extractor.request(
             self, passport_url, method="POST", headers=headers, data=data).text
-        data = json.loads(text.extract(page, "(", ");")[0])["data"]
+        data = json.loads(text.extr(page, "(", ");"))["data"]

         passport_url = "https://passport.weibo.com/visitor/visitor"
         params = {
@@ -144,8 +144,8 @@ class XhamsterGalleryExtractor(XhamsterExtractor):

     def _data(self, url):
         page = self.request(url).text
-        return json.loads(text.extract(
-            page, "window.initials=", "</script>")[0].rstrip("\n\r;"))
+        return json.loads(text.extr(
+            page, "window.initials=", "</script>").rstrip("\n\r;"))


 class XhamsterUserExtractor(XhamsterExtractor):
@@ -113,8 +113,8 @@ class XvideosUserExtractor(XvideosBase, Extractor):
     def items(self):
         url = "{}/profiles/{}".format(self.root, self.user)
         page = self.request(url, notfound=self.subcategory).text
-        data = json.loads(text.extract(
-            page, "xv.conf=", ";</script>")[0])["data"]
+        data = json.loads(text.extr(
+            page, "xv.conf=", ";</script>"))["data"]

         if not isinstance(data["galleries"], dict):
             return

@@ -127,7 +127,7 @@ class ZerochanTagExtractor(ZerochanExtractor):

         while True:
             page = self.request(url, params=params).text
-            thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
+            thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
             extr = text.extract_from(thumbs)

             while True: