unescape items in text.split_html()

2024-11-22 02:32:33 +01:00 · 2021-03-29 02:12:29 +02:00 · 2021-03-29 02:12:29 +02:00 · 387fe415d5
commit 387fe415d5
parent 36291176bc
4 changed files with 13 additions and 9 deletions
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -126,8 +126,8 @@ class AryionExtractor(Extractor):
            "user"  : self.user or artist,
            "title" : title,
            "artist": artist,
-            "path"  : text.split_html(text.unescape(extr(
-                "cookiecrumb'>", '</span')))[4:-1:2],
+            "path"  : text.split_html(extr(
+                "cookiecrumb'>", '</span'))[4:-1:2],
            "date"  : extr("class='pretty-date' title='", "'"),
            "size"  : text.parse_int(clen),
            "views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,19 +1,18 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extract manga-chapters from https://dynasty-scans.com/"""
+"""Extractors for https://dynasty-scans.com/"""

 from .common import ChapterExtractor, Extractor, Message
 from .. import text
 import json
 import re

-
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"


@@ -36,7 +35,7 @@ class DynastyscansBase():
        return {
            "url"     : self.root + url,
            "image_id": text.parse_int(image_id),
-            "tags"    : text.split_html(text.unescape(tags)),
+            "tags"    : text.split_html(tags),
            "date"    : text.remove_html(date),
            "source"  : text.unescape(src),
        }
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -27,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
    return txt.strip()


-def split_html(txt, sep=None):
-    """Split input string by html-tags"""
+def split_html(txt):
+    """Split input string by HTML tags"""
    try:
        return [
-            x.strip() for x in HTML_RE.split(txt)
+            unescape(x).strip()
+            for x in HTML_RE.split(txt)
            if x and not x.isspace()
        ]
    except TypeError:
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -59,6 +59,10 @@ class TestText(unittest.TestCase):
        self.assertEqual(
            f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)

+        # escaped HTML entities
+        self.assertEqual(
+            f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])
+
        # empty HTML
        self.assertEqual(f("<div></div>"), empty)
        self.assertEqual(f(" <div>   </div> "), empty)