[deviantart] fix issue with small images

2024-11-22 10:42:34 +01:00 · 2015-12-06 14:24:27 +01:00 · 2015-12-06 14:24:27 +01:00 · e4a661fd6b
commit e4a661fd6b
parent 3ebd126b35
1 changed file with 23 additions and 26 deletions
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -13,11 +13,11 @@ from .. import text
 import re

 class DeviantArtExtractor(AsynchronousExtractor):
-
+    """Extract all works of an artist on deviantart"""
    category = "deviantart"
    directory_fmt = ["{category}", "{artist}"]
    filename_fmt = "{category}_{index}_{title}.{extension}"
-    pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com/gallery/.*"]
+    pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com(?:/gallery)?/?$"]

    def __init__(self, match):
        AsynchronousExtractor.__init__(self)
@@ -57,39 +57,36 @@ class DeviantArtExtractor(AsynchronousExtractor):

    def get_image_metadata(self, image):
        """Collect metadata for an image"""
-        match = self.extract_data(image, 'title',
+        tmatch = self.extract_data(image, 'title',
            r'(.+) by (.+), ([A-Z][a-z]{2} \d+, \d{4}) in')
-        if image.startswith(" ismature"):
-            # adult image
-            url, _ = text.extract(image, 'href="', '"')
-            page = self.request(url).text
-            _     , pos = text.extract(page, ' class="dev-content-normal "', '')
-            url   , pos = text.extract(page, ' src="', '"', pos)
-            index , pos = text.extract(page, ' data-embed-id="', '"', pos)
-            width , pos = text.extract(page, ' width="', '"', pos)
-            height, pos = text.extract(page, ' height="', '"', pos)
+        hmatch = self.extract_data(image, 'href', r'[^"]+-(\d+)')
+
+        url, pos = text.extract(image, ' data-super-full-img="', '"', tmatch.end())
+        if url:
+            width , pos = text.extract(image, ' data-super-full-width="', '"', pos)
+            height, pos = text.extract(image, ' data-super-full-height="', '"', pos)
        else:
-            # normal image
-            index = self.extract_data(image, 'href', r'[^"]+-(\d+)').group(1)
-            url, pos = text.extract(image, ' data-super-full-img="', '"', match.end())
+            url, pos = text.extract(image, ' data-super-img="', '"', pos)
            if url:
-                width , pos = text.extract(image, ' data-super-full-width="', '"', pos)
-                height, pos = text.extract(image, ' data-super-full-height="', '"', pos)
-            else:
-                url   , pos = text.extract(image, ' data-super-img="', '"', pos)
                width , pos = text.extract(image, ' data-super-width="', '"', pos)
                height, pos = text.extract(image, ' data-super-height="', '"', pos)
-        data = {
-            "index": index,
-            "title": match.group(1),
-            "artist": match.group(2),
-            "date": match.group(3),
+            else:
+                page = self.request(hmatch.group(0)).text
+                _     , pos = text.extract(page, ' class="dev-content-normal "', '')
+                url   , pos = text.extract(page, ' src="', '"', pos)
+                width , pos = text.extract(page, ' width="', '"', pos)
+                height, pos = text.extract(page, ' height="', '"', pos)
+        return url, text.nameext_from_url(url, {
+            "index": hmatch.group(1),
+            "title": text.unescape(tmatch.group(1)),
+            "artist": tmatch.group(2),
+            "date": tmatch.group(3),
            "width": width,
            "height": height,
-        }
-        return url, text.nameext_from_url(url, data)
+        })

    @staticmethod
    def extract_data(txt, attr, pattern):
+        """Extract a HTML attribute and apply a regex to it"""
        txt, _ = text.extract(txt, ' %s="' % attr, '"')
        return re.match(pattern, txt)