1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 02:32:33 +01:00

unescape items in text.split_html()

This commit is contained in:
Mike Fährmann 2021-03-29 02:12:29 +02:00
parent 36291176bc
commit 387fe415d5
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 13 additions and 9 deletions

View File

@@ -126,8 +126,8 @@ class AryionExtractor(Extractor):
"user" : self.user or artist,
"title" : title,
"artist": artist,
"path" : text.split_html(text.unescape(extr(
"cookiecrumb'>", '</span')))[4:-1:2],
"path" : text.split_html(extr(
"cookiecrumb'>", '</span'))[4:-1:2],
"date" : extr("class='pretty-date' title='", "'"),
"size" : text.parse_int(clen),
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),

View File

@@ -1,19 +1,18 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2019 Mike Fährmann
# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract manga-chapters from https://dynasty-scans.com/"""
"""Extractors for https://dynasty-scans.com/"""
from .common import ChapterExtractor, Extractor, Message
from .. import text
import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
@@ -36,7 +35,7 @@ class DynastyscansBase():
return {
"url" : self.root + url,
"image_id": text.parse_int(image_id),
"tags" : text.split_html(text.unescape(tags)),
"tags" : text.split_html(tags),
"date" : text.remove_html(date),
"source" : text.unescape(src),
}

View File

@@ -27,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
return txt.strip()
def split_html(txt, sep=None):
"""Split input string by html-tags"""
def split_html(txt):
"""Split input string by HTML tags"""
try:
return [
x.strip() for x in HTML_RE.split(txt)
unescape(x).strip()
for x in HTML_RE.split(txt)
if x and not x.isspace()
]
except TypeError:

View File

@@ -59,6 +59,10 @@ class TestText(unittest.TestCase):
self.assertEqual(
f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
# escaped HTML entities
self.assertEqual(
f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])
# empty HTML
self.assertEqual(f("<div></div>"), empty)
self.assertEqual(f(" <div> </div> "), empty)