From 387fe415d5d3de5ca334aed7abf6b8e79e4f4736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 29 Mar 2021 02:12:29 +0200 Subject: [PATCH] unescape items in text.split_html() --- gallery_dl/extractor/aryion.py | 4 ++-- gallery_dl/extractor/dynastyscans.py | 7 +++---- gallery_dl/text.py | 7 ++++--- test/test_text.py | 4 ++++ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 2056d469..ded2ae38 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -126,8 +126,8 @@ class AryionExtractor(Extractor): "user" : self.user or artist, "title" : title, "artist": artist, - "path" : text.split_html(text.unescape(extr( - "cookiecrumb'>", '", ':", "<").replace(",", "")), diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 7d26c476..67051c92 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -1,19 +1,18 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters from https://dynasty-scans.com/""" +"""Extractors for https://dynasty-scans.com/""" from .common import ChapterExtractor, Extractor, Message from .. import text import json import re - BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" @@ -36,7 +35,7 @@ class DynastyscansBase(): return { "url" : self.root + url, "image_id": text.parse_int(image_id), - "tags" : text.split_html(text.unescape(tags)), + "tags" : text.split_html(tags), "date" : text.remove_html(date), "source" : text.unescape(src), } diff --git a/gallery_dl/text.py b/gallery_dl/text.py index e20aa515..a6a9105b 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -27,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "): return txt.strip() -def split_html(txt, sep=None): - """Split input string by html-tags""" +def split_html(txt): + """Split input string by HTML tags""" try: return [ - x.strip() for x in HTML_RE.split(txt) + unescape(x).strip() + for x in HTML_RE.split(txt) if x and not x.isspace() ] except TypeError: diff --git a/test/test_text.py b/test/test_text.py index 675a04cf..1daefde8 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -59,6 +59,10 @@ class TestText(unittest.TestCase): self.assertEqual( f("
HelloWorld.
"), result) + # escaped HTML entities + self.assertEqual( + f("<foo> <bar> "), ["", ""]) + # empty HTML self.assertEqual(f("
"), empty) self.assertEqual(f("
"), empty)