mirror of
https://github.com/mikf/gallery-dl.git
synced 2024-11-22 10:42:34 +01:00
[imagebam] rewrite/fix
This commit is contained in:
parent
8a07ccfc6e
commit
c0efea339e
@ -10,14 +10,15 @@
|
|||||||
|
|
||||||
from .common import AsynchronousExtractor, Message
|
from .common import AsynchronousExtractor, Message
|
||||||
from .. import text
|
from .. import text
|
||||||
|
import os.path
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
"category": "imagebam",
|
"category": "imagebam",
|
||||||
"extractor": "ImagebamExtractor",
|
"extractor": "ImagebamExtractor",
|
||||||
"directory": ["{category}", "{title} - {key}"],
|
"directory": ["{category}", "{title} - {gallery-key}"],
|
||||||
"filename": "{num:>03}-{name}",
|
"filename": "{num:>03}-{filename}",
|
||||||
"pattern": [
|
"pattern": [
|
||||||
r"(?:https?://)?(?:www\.)?imagebam\.com/(gallery)/([^/]+).*",
|
r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+).*",
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -27,54 +28,48 @@ class ImagebamExtractor(AsynchronousExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
AsynchronousExtractor.__init__(self)
|
AsynchronousExtractor.__init__(self)
|
||||||
self.match = match
|
self.gkey = match.group(1)
|
||||||
self.num = 0
|
|
||||||
self.metadata = {}
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
self.num = 0
|
data = self.get_job_metadata()
|
||||||
self.metadata = self.get_job_metadata()
|
data["num"] = 0
|
||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
yield Message.Directory, self.metadata
|
yield Message.Directory, data
|
||||||
|
for image_url, image_id in self.get_images(data["first-url"]):
|
||||||
next_url = self.metadata["first-url"]
|
data["id"] = image_id
|
||||||
done = False
|
data["filename"] = text.unquote(text.filename_from_url(image_url))
|
||||||
while not done:
|
name, ext = os.path.splitext(data["filename"])
|
||||||
# get current page
|
data["num"] += 1
|
||||||
page = self.request(self.url_base + next_url).text
|
data["name"] = name
|
||||||
|
data["extension"] = ext[1:]
|
||||||
# get url for next page
|
yield Message.Url, image_url, data.copy()
|
||||||
next_url, pos = text.extract(page, "<a class='buttonblue' href='", "'")
|
|
||||||
|
|
||||||
# if the following text isn't "><span>next image" we are done
|
|
||||||
if not page.startswith("><span>next image", pos):
|
|
||||||
done = True
|
|
||||||
|
|
||||||
# get image url
|
|
||||||
img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)
|
|
||||||
|
|
||||||
yield Message.Url, img_url, self.get_file_metadata(img_url)
|
|
||||||
|
|
||||||
def get_job_metadata(self):
|
def get_job_metadata(self):
|
||||||
"""Collect metadata for extractor-job"""
|
"""Collect metadata for extractor-job"""
|
||||||
gallery_key = self.match.group(2)
|
response = self.request(self.url_base + "/gallery/" + self.gkey)
|
||||||
page = self.request(self.url_base + "/gallery/" + gallery_key).text
|
response.encoding = "utf-8"
|
||||||
_ , pos = text.extract(page, "<img src='/img/icons/photos.png'", "")
|
page = response.text
|
||||||
title, pos = text.extract(page, "'> ", " <", pos)
|
data = {
|
||||||
count, pos = text.extract(page, "'>", " images", pos)
|
|
||||||
url , pos = text.extract(page, "<a href='http://www.imagebam.com", "'", pos)
|
|
||||||
return {
|
|
||||||
"category": info["category"],
|
"category": info["category"],
|
||||||
"key": gallery_key,
|
"gallery-key": self.gkey,
|
||||||
"title": title,
|
|
||||||
"count": count,
|
|
||||||
"first-url": url,
|
|
||||||
}
|
}
|
||||||
|
data, _ = text.extract_all(page, (
|
||||||
def get_file_metadata(self, url):
|
(None , "<img src='/img/icons/photos.png'", ""),
|
||||||
"""Collect metadata for a downloadable file"""
|
("title" , "'> ", " <"),
|
||||||
self.num += 1
|
("count" , "'>", " images"),
|
||||||
data = self.metadata.copy()
|
("first-url", "<a href='http://www.imagebam.com", "'"),
|
||||||
data["num"] = self.num
|
), values=data)
|
||||||
data["name"] = text.filename_from_url(url)
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def get_images(self, url):
|
||||||
|
done = False
|
||||||
|
while not done:
|
||||||
|
page = self.request(self.url_base + url).text
|
||||||
|
_ , pos = text.extract(page, 'class="btn btn-default" title="Next">', '')
|
||||||
|
if pos == 0:
|
||||||
|
done = True
|
||||||
|
else:
|
||||||
|
url, pos = text.extract(page, ' href="', '"', pos-70)
|
||||||
|
image_id , pos = text.extract(page, '<img class="image" id="', '"', pos)
|
||||||
|
image_url, pos = text.extract(page, ' src="', '"', pos)
|
||||||
|
yield image_url, image_id
|
||||||
|
Loading…
Reference in New Issue
Block a user