1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2025-01-31 19:51:34 +01:00

[generic] add support for IDNs

(internationalized domain names)
This commit is contained in:
ClosedPort22 2023-03-06 19:51:25 +08:00
parent 7610d9cf82
commit 34a7fab0e2
No known key found for this signature in database
2 changed files with 28 additions and 2 deletions

View File

@@ -44,6 +44,10 @@ class DirectlinkExtractor(Extractor):
("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
"mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
# internationalized domain name
("https://räksmörgås.josefsson.org/raksmorgas.jpg", {
"content": "f7e00768ab009c969e70d775047cdd302ca51762",
}),
)
def __init__(self, match):

View File

@@ -26,12 +26,34 @@ class GenericExtractor(Extractor):
# Based on: https://tools.ietf.org/html/rfc3986#appendix-B
pattern += r"""
(?P<scheme>https?://)? # optional http(s) scheme
(?P<domain>[-\w\.]+) # required domain
(?P<domain>[^/?#]+) # required domain
(?P<path>/[^?#]*)? # optional path
(?:\?(?P<query>[^#]*))? # optional query
(?:\#(?P<fragment>.*))? # optional fragment
"""
test = (
("generic:https://www.nongnu.org/lzip/", {
"count": 1,
"content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
"keyword": {
"description": "Lossless data compressor",
"imageurl": "https://www.nongnu.org/lzip/lzip.png",
"keywords": "lzip, clzip, plzip, lzlib, LZMA, bzip2, "
"gzip, data compression, GNU, free software",
"pageurl": "https://www.nongnu.org/lzip/",
},
}),
# internationalized domain name
("generic:https://räksmörgås.josefsson.org/", {
"count": 2,
"pattern": "^https://räksmörgås.josefsson.org/",
}),
("generic:https://en.wikipedia.org/Main_Page"),
("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
)
def __init__(self, match):
"""Init."""
Extractor.__init__(self, match)
@@ -56,7 +78,7 @@ class GenericExtractor(Extractor):
self.root = self.scheme + match.group('domain')
def items(self):
"""Get page, extract metadata & images, yield them in suitable messages.
"""Get page, extract metadata & images, yield them in suitable messages
Adapted from common.GalleryExtractor.items()