mirror of
https://github.com/mikf/gallery-dl.git
synced 2024-11-22 02:32:33 +01:00
unescape items in text.split_html()
This commit is contained in:
parent
36291176bc
commit
387fe415d5
@ -126,8 +126,8 @@ class AryionExtractor(Extractor):
|
||||
"user" : self.user or artist,
|
||||
"title" : title,
|
||||
"artist": artist,
|
||||
"path" : text.split_html(text.unescape(extr(
|
||||
"cookiecrumb'>", '</span')))[4:-1:2],
|
||||
"path" : text.split_html(extr(
|
||||
"cookiecrumb'>", '</span'))[4:-1:2],
|
||||
"date" : extr("class='pretty-date' title='", "'"),
|
||||
"size" : text.parse_int(clen),
|
||||
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
|
||||
|
@ -1,19 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2019 Mike Fährmann
|
||||
# Copyright 2015-2021 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract manga-chapters from https://dynasty-scans.com/"""
|
||||
"""Extractors for https://dynasty-scans.com/"""
|
||||
|
||||
from .common import ChapterExtractor, Extractor, Message
|
||||
from .. import text
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
|
||||
|
||||
|
||||
@ -36,7 +35,7 @@ class DynastyscansBase():
|
||||
return {
|
||||
"url" : self.root + url,
|
||||
"image_id": text.parse_int(image_id),
|
||||
"tags" : text.split_html(text.unescape(tags)),
|
||||
"tags" : text.split_html(tags),
|
||||
"date" : text.remove_html(date),
|
||||
"source" : text.unescape(src),
|
||||
}
|
||||
|
@ -27,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
|
||||
return txt.strip()
|
||||
|
||||
|
||||
def split_html(txt, sep=None):
|
||||
"""Split input string by html-tags"""
|
||||
def split_html(txt):
|
||||
"""Split input string by HTML tags"""
|
||||
try:
|
||||
return [
|
||||
x.strip() for x in HTML_RE.split(txt)
|
||||
unescape(x).strip()
|
||||
for x in HTML_RE.split(txt)
|
||||
if x and not x.isspace()
|
||||
]
|
||||
except TypeError:
|
||||
|
@ -59,6 +59,10 @@ class TestText(unittest.TestCase):
|
||||
self.assertEqual(
|
||||
f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
|
||||
|
||||
# escaped HTML entities
|
||||
self.assertEqual(
|
||||
f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])
|
||||
|
||||
# empty HTML
|
||||
self.assertEqual(f("<div></div>"), empty)
|
||||
self.assertEqual(f(" <div> </div> "), empty)
|
||||
|
Loading…
Reference in New Issue
Block a user