2023-03-28 18:06:41 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2023-04-15 18:06:06 +02:00
|
|
|
"""Extractors for general-purpose URL shorteners"""
|
2023-03-28 18:06:41 +02:00
|
|
|
|
|
|
|
from .common import BaseExtractor, Message
|
|
|
|
from .. import exception
|
|
|
|
|
|
|
|
|
|
|
|
class UrlshortenerExtractor(BaseExtractor):
    """Common base class shared by the URL shortener extractors"""
    # 'basecategory' value inherited by every extractor derived from this class
    basecategory = "urlshortener"
|
2023-04-15 18:06:06 +02:00
|
|
|
|
|
|
|
|
2024-01-18 03:20:36 +01:00
|
|
|
# Register the supported shortener services with the base extractor class.
# The value returned by update() is used below as the leading part of the
# URL pattern of UrlshortenerLinkExtractor.
BASE_PATTERN = UrlshortenerExtractor.update({
    "bitly": {
        "root": "https://bit.ly",
        "pattern": r"bit\.ly",
    },
    "tco": {
        # Override the User-Agent header with None for this service:
        # t.co sends 'http-equiv="refresh"' (200) when using browser UA
        "headers": {"User-Agent": None},
        "root": "https://t.co",
        "pattern": r"t\.co",
    },
})
|
2023-04-15 18:06:06 +02:00
|
|
|
|
|
|
|
|
|
|
|
class UrlshortenerLinkExtractor(UrlshortenerExtractor):
    """Extractor for general-purpose URL shorteners"""
    subcategory = "link"
    pattern = BASE_PATTERN + r"/([^/?#]+)"
    example = "https://bit.ly/abcde"

    def __init__(self, match):
        """Initialize the extractor from a regex match on 'pattern'.

        The short-link identifier is taken from the last capture group
        of 'match', i.e. the '([^/?#]+)' group appended to BASE_PATTERN.
        """
        UrlshortenerExtractor.__init__(self, match)
        self.id = match.group(match.lastindex)

    def _init(self):
        """Fetch the instance-level "headers" setting for later requests."""
        # NOTE(review): presumably this yields the per-service "headers"
        # dict registered via update() (or None) -- confirm in BaseExtractor
        self.headers = self.config_instance("headers")

    def items(self):
        """Resolve the short URL and queue the URL it redirects to.

        Sends a HEAD request for '<root>/<id>' without following
        redirects and yields a Message.Queue entry for the value of the
        response's "location" header, with empty metadata.

        Raises:
            exception.StopExtraction: if the response has no
                "location" header.
        """
        response = self.request(
            "{}/{}".format(self.root, self.id), headers=self.headers,
            method="HEAD", allow_redirects=False, notfound="URL")

        # Only the header lookup belongs inside the 'try' block. Keeping
        # the 'yield' outside ensures a KeyError thrown into the suspended
        # generator is not misreported as an unresolvable short URL.
        try:
            location = response.headers["location"]
        except KeyError:
            raise exception.StopExtraction("Unable to resolve short URL")

        yield Message.Queue, location, {}
|