merge #3841: [urlshortener] add support for bit.ly & t.co

2024-11-25 04:02:32 +01:00 · 2023-04-15 18:08:21 +02:00 · 2023-04-15 18:08:21 +02:00 · d253a3c542
commit d253a3c542
parent 2edcdee32f 5e63942b37
5 changed files with 93 additions and 0 deletions
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@ -317,6 +317,10 @@
            "archive": "~/gallery-dl/custom-archive-file-for-TBIB.db",
            "filename": "{id}_{md5}.{extension}",
            "sleep-request": [0, 1.2]
+        },
+
+        "urlshortener": {
+            "tinyurl": {"root": "https://tinyurl.com"}
        }
    },

--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -1270,6 +1270,22 @@ Consider all sites to be NSFW unless otherwise known.
    <td></td>
 </tr>

+<tr>
+    <td colspan="4"><strong>URL Shorteners</strong></td>
+</tr>
+<tr>
+    <td>Bitly</td>
+    <td>https://bit.ly/</td>
+    <td>Links</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Twitter t.co</td>
+    <td>https://t.co/</td>
+    <td>Links</td>
+    <td></td>
+</tr>
+
 <tr>
    <td colspan="4"><strong>vichan Imageboards</strong></td>
 </tr>
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -153,6 +153,7 @@ modules = [
    "twitter",
    "unsplash",
    "uploadir",
+    "urlshortener",
    "vanillarock",
    "vichan",
    "vk",
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for general-purpose URL shorteners"""
+
+from .common import BaseExtractor, Message
+from .. import exception
+
+
+class UrlshortenerExtractor(BaseExtractor):
+    """Base class for URL shortener extractors"""
+    # group name shared by every shortener instance handled in this module
+    basecategory = "urlshortener"
+
+
+# Built-in shortener services, keyed by category name.
+# Each entry provides the service's "root" URL and a regex "pattern" for
+# its host name; an optional "headers" dict is passed along with the
+# request that resolves a short link.
+INSTANCES = {
+    "bitly": {
+        "root": "https://bit.ly",
+        "pattern": r"bit\.ly",
+    },
+    "tco": {
+        # t.co sends 'http-equiv="refresh"' (200) when using browser UA,
+        # so the User-Agent header is set to None for this instance
+        "headers": {"User-Agent": None},
+        "root": "https://t.co",
+        "pattern": r"t\.co",
+    },
+}
+
+# pattern prefix produced by update(); the link extractor below appends
+# the capture group for the short-link ID to it
+BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)
+
+
+class UrlshortenerLinkExtractor(UrlshortenerExtractor):
+    """Extractor for general-purpose URL shorteners"""
+    subcategory = "link"
+    pattern = BASE_PATTERN + r"/([^/?&#]+)"
+    test = (
+        ("https://bit.ly/3cWIUgq", {
+            "count": 1,
+            "pattern": "^https://gumroad.com/l/storm_b1",
+        }),
+        ("https://t.co/bCgBY8Iv5n", {
+            "count": 1,
+            "pattern": "^https://twitter.com/elonmusk/status/"
+                       "1421395561324896257/photo/1",
+        }),
+        ("https://t.co/abcdefghij", {
+            "exception": exception.NotFoundError,
+        }),
+    )
+
+    def __init__(self, match):
+        UrlshortenerExtractor.__init__(self, match)
+        self.id = match.group(match.lastindex)
+
+        try:
+            self.headers = INSTANCES[self.category]["headers"]
+        except Exception:
+            self.headers = None
+
+    def items(self):
+        response = self.request(
+            "{}/{}".format(self.root, self.id), headers=self.headers,
+            method="HEAD", allow_redirects=False, notfound="URL")
+        try:
+            yield Message.Queue, response.headers["location"], {}
+        except KeyError:
+            raise exception.StopExtraction("Unable to resolve short URL")
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@ -112,6 +112,7 @@ CATEGORY_MAP = {
    "subscribestar"  : "SubscribeStar",
    "tbib"           : "The Big ImageBoard",
    "tcbscans"       : "TCB Scans",
+    "tco"            : "Twitter t.co",
    "thatpervert"    : "ThatPervert",
    "thebarchive"    : "The /b/ Archive",
    "thecollection"  : "The /co/llection",
@ -132,6 +133,7 @@ CATEGORY_MAP = {
 }

 SUBCATEGORY_MAP = {
+    ""       : "",
    "art"    : "Art",
    "audio"  : "Audio",
    "doujin" : "Doujin",
@ -266,6 +268,7 @@ BASE_MAP = {
    "lynxchan"    : "LynxChan Imageboards",
    "moebooru"    : "Moebooru and MyImouto",
    "szurubooru"  : "szurubooru Instances",
+    "urlshortener": "URL Shorteners",
    "vichan"      : "vichan Imageboards",
 }