[exhentai] use text.extract_all

Mike Fährmann 2015-11-03 00:10:30 +01:00
parent 1fa6a99f18
commit 353ac1e00b

@@ -10,7 +10,6 @@
 
 from .common import Extractor, Message
 from .. import config, text
-import re
 import os.path
 import time
 import random
@@ -27,7 +26,7 @@ info = {
 
 
 class ExhentaiExtractor(Extractor):
-    api_url = "http://exhentai.org/api.php"
+    api_url = "http://exhentai.org/api.php"
 
     def __init__(self, match):
         Extractor.__init__(self)
@@ -64,39 +63,43 @@ class ExhentaiExtractor(Extractor):
             image["name"] = name
             image["extension"] = ext[1:]
             if "/fullimg.php" in image[urlkey]:
-                time.sleep( random.uniform(1, 2) )
+                time.sleep(random.uniform(1, 2))
             yield Message.Url, image[urlkey], image
 
     def get_job_metadata(self, page):
-        title   , pos = text.extract(page, '<h1 id="gn">', '</h1>')
-        title_jp, pos = text.extract(page, '<h1 id="gj">', '</h1>', pos)
-        date    , pos = text.extract(page, '>Posted:</td><td class="gdt2">', '</td>', pos)
-        language, pos = text.extract(page, '>Language:</td><td class="gdt2">', '</td>', pos)
-        size    , pos = text.extract(page, '>File Size:</td><td class="gdt2">', ' ', pos)
-        url     , pos = text.extract(page, 'hentai.org/s/', '"', pos)
-        return {
-            "category": info["category"],
-            "gallery-id": self.gid,
+        """Collect metadata for extractor-job"""
+        data = {
+            "category"     : info["category"],
+            "gallery-id"   : self.gid,
             "gallery-token": self.token,
-            "title": title,
-            "title-jp": title_jp,
-            "date": date,
-            "language": language,
-            "size": size,
-        }, "http://exhentai.org/s/" + url
+        }
+        data, _ = text.extract_all(page, (
+            ("title"   , '<h1 id="gn">', '</h1>'),
+            ("title_jp", '<h1 id="gj">', '</h1>'),
+            ("date"    , '>Posted:</td><td class="gdt2">', '</td>'),
+            ("language", '>Language:</td><td class="gdt2">', '</td>'),
+            ("size"    , '>File Size:</td><td class="gdt2">', ' '),
+            ("count"   , '>Length:</td><td class="gdt2">', ' '),
+            ("url"     , 'hentai.org/s/', '"'),
+        ), values=data)
+        url = "http://exhentai.org/s/" + data["url"]
+        del data["url"]
+        return data, url
 
     def get_images(self, url):
-        time.sleep( random.uniform(3, 6) )
+        """Collect url and metadata for all images in this gallery"""
+        time.sleep(random.uniform(3, 6))
         page = self.request(url).text
-        data = {}
-        _               , pos = text.extract(page, '<div id="i3"><a onclick="return load_image(', '')
-        data["imgkey"]  , pos = text.extract(page, "'", "'", pos)
-        data["url"]     , pos = text.extract(page, '<img id="img" src="', '"', pos)
-        data["title"]   , pos = text.extract(page, '<div id="i4"><div>', ' :: ', pos)
-        data["origurl"] , pos = text.extract(page, 'http://exhentai.org/fullimg.php', '"', pos)
-        data["gid"]     , pos = text.extract(page, 'var gid=' , ';', pos)
-        data["startkey"], pos = text.extract(page, 'var startkey="', '";', pos)
-        data["showkey"] , pos = text.extract(page, 'var showkey="' , '";', pos)
+        data, pos = text.extract_all(page, (
+            (None      , '<div id="i3"><a onclick="return load_image(', ''),
+            ("imgkey"  , "'", "'"),
+            ("url"     , '<img id="img" src="', '"'),
+            ("title"   , '<div id="i4"><div>', ' :: '),
+            ("origurl" , 'http://exhentai.org/fullimg.php', '"'),
+            ("gid"     , 'var gid=', ';'),
+            ("startkey", 'var startkey="', '";'),
+            ("showkey" , 'var showkey="', '";'),
+        ))
         if data["origurl"]:
             data["origurl"] = "http://exhentai.org/fullimg.php" + text.unescape(data["origurl"])
         else:
@@ -111,10 +114,7 @@ class ExhentaiExtractor(Extractor):
             "showkey": data["showkey"],
         }
         while True:
-            time.sleep( random.uniform(3, 6) )
-            # page = safe_request(
-            #     self.session, self.api_url, method="POST", json=request
-            # ).json
+            time.sleep(random.uniform(3, 6))
             page = self.session.post(self.api_url, json=request).json()
             data["imgkey"] , pos = text.extract(page["i3"], "'", "'")
             data["url"]    , pos = text.extract(page["i3"], '<img id="img" src="', '"', pos)