1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 02:32:33 +01:00

[deviantart] extract journal HTML from webpage (#6254, #6207, #6196)

This commit is contained in:
Mike Fährmann 2024-10-01 21:08:19 +02:00
parent fb6be2dd34
commit 7dbd53e9b4
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@@ -369,15 +369,30 @@ class DeviantartExtractor(Extractor):
else:
page = self._limited_request(deviation["url"]).text
# extract journal html from webpage
html = text.extr(
page,
"<h2>Literature Text</h2></span><div>",
"</div></section>")
if html:
return {"html": html}
self.log.warning("%s: Failed to extract journal HTML from "
"webpage. Falling back to __INITIAL_STATE__ "
"markup.", deviation["index"])
# parse __INITIAL_STATE__ as fallback
state = util.json_loads(text.extr(
page, 'window.__INITIAL_STATE__ = JSON.parse("', '");')
.replace("\\\\", "\\").replace("\\'", "'").replace('\\"', '"'))
deviation = state["@@entities"]["deviation"].popitem()[1]
content = deviation["textContent"]
deviations = state["@@entities"]["deviation"]
content = deviations.popitem()[1]["textContent"]
html = content["html"]["markup"]
if html.startswith("{"):
self.log.warning("%s: Unsupported '%s' markup.",
deviation["index"], content["html"]["type"])
html = content["excerpt"].replace("\n", "<br />")
return {"html": html}