From 387fe415d5d3de5ca334aed7abf6b8e79e4f4736 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 29 Mar 2021 02:12:29 +0200
Subject: [PATCH] unescape items in text.split_html()

---
 gallery_dl/extractor/aryion.py       | 4 ++--
 gallery_dl/extractor/dynastyscans.py | 7 +++----
 gallery_dl/text.py                   | 7 ++++---
 test/test_text.py                    | 4 ++++
 4 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index 2056d469..ded2ae38 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -126,8 +126,8 @@ class AryionExtractor(Extractor):
             "user"  : self.user or artist,
             "title" : title,
             "artist": artist,
-            "path"  : text.split_html(text.unescape(extr(
-                "cookiecrumb'>", '</span')))[4:-1:2],
+            "path"  : text.split_html(extr(
+                "cookiecrumb'>", '</span'))[4:-1:2],
             "date"  : extr("class='pretty-date' title='", "'"),
             "size"  : text.parse_int(clen),
             "views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 7d26c476..67051c92 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,19 +1,18 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract manga-chapters from https://dynasty-scans.com/"""
+"""Extractors for https://dynasty-scans.com/"""
 
 from .common import ChapterExtractor, Extractor, Message
 from .. import text
 import json
 import re
 
-
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
 
 
@@ -36,7 +35,7 @@ class DynastyscansBase():
         return {
             "url"     : self.root + url,
             "image_id": text.parse_int(image_id),
-            "tags"    : text.split_html(text.unescape(tags)),
+            "tags"    : text.split_html(tags),
             "date"    : text.remove_html(date),
             "source"  : text.unescape(src),
         }
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index e20aa515..a6a9105b 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -27,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
     return txt.strip()
 
 
-def split_html(txt, sep=None):
-    """Split input string by html-tags"""
+def split_html(txt):
+    """Split input string by HTML tags"""
     try:
         return [
-            x.strip() for x in HTML_RE.split(txt)
+            unescape(x).strip()
+            for x in HTML_RE.split(txt)
             if x and not x.isspace()
         ]
     except TypeError:
diff --git a/test/test_text.py b/test/test_text.py
index 675a04cf..1daefde8 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -59,6 +59,10 @@ class TestText(unittest.TestCase):
         self.assertEqual(
             f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
 
+        # escaped HTML entities
+        self.assertEqual(
+            f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])
+
         # empty HTML
         self.assertEqual(f("<div></div>"), empty)
         self.assertEqual(f(" <div>   </div> "), empty)