gallery-dl/gallery_dl/extractor/myportfolio.py

# -*- coding: utf-8 -*-

# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://www.myportfolio.com/"""

from .common import Extractor, Message
from .. import text, exception


class MyportfolioGalleryExtractor(Extractor):
    """Extractor for an image gallery on www.myportfolio.com

    A single URL can refer either to one gallery or to a listing of
    several galleries ("project covers"); items() decides which one it
    is by looking at the page content.
    """
    category = "myportfolio"
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user}", "{title}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{user}_{filename}"
    # Capture groups:
    #   1: domain given after an explicit "myportfolio:" prefix
    #      (used for custom domains)
    #   2: "<name>.myportfolio.com" domain
    #   3: optional first path segment, including its leading "/"
    pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
               r"(?:https?://)?([\w-]+\.myportfolio\.com))"
               r"(/[^/?&#]+)?")
    test = (
        ("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", {
            "url": "acea0690c76db0e5cf267648cefd86e921bc3499",
            "keyword": "6ac6befe2ee0af921d24cf1dd4a4ed71be06db6d",
        }),
        ("https://andrewling.myportfolio.com/", {
            "pattern": r"https://andrewling\.myportfolio\.com/[^/?#+]+$",
            "count": ">= 6",
        }),
        ("https://stevenilousphotography.myportfolio.com/society", {
            "exception": exception.NotFoundError,
        }),
        # custom domain
        ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
            "count": 3,
        }),
        ("myportfolio:https://tooco.com.ar/", {
            "pattern": pattern,
            "count": ">= 40",
        }),
    )

    def __init__(self, match):
        """Store domain, path, and URL prefix taken from the regex match"""
        Extractor.__init__(self, match)
        domain1, domain2, self.path = match.groups()
        self.domain = domain1 or domain2
        # Remember whether the "myportfolio:" prefix was used, so that
        # queued project URLs on the same domain get the same prefix.
        self.prefix = "myportfolio:" if domain1 else ""

    def items(self):
        """Yield messages for a gallery listing or for a single gallery

        If the page contains a 'project-covers' section, a Queue message
        is yielded for every link found in it.  Otherwise the page is
        handled as a single gallery: one Directory message followed by
        one Url message per image.

        Raises NotFoundError if the request got redirected to an
        '.adobe.com/missing' URL, or if metadata() finds no title
        information on a gallery page.
        """
        url = "https://" + self.domain + (self.path or "")
        response = self.request(url)
        # A redirect ending up on ".adobe.com/missing" is treated as
        # "gallery does not exist".
        if response.history and response.url.endswith(".adobe.com/missing"):
            raise exception.NotFoundError()
        page = response.text

        projects = text.extr(
            page, '<section class="project-covers', '</section>')

        if projects:
            data = {"_extractor": MyportfolioGalleryExtractor}
            base = self.prefix + "https://" + self.domain
            for path in text.extract_iter(projects, ' href="', '"'):
                yield Message.Queue, base + path, data
        else:
            data = self.metadata(page)
            imgs = self.images(page)
            data["count"] = len(imgs)
            yield Message.Directory, data
            for num, image_url in enumerate(imgs, 1):
                data["num"] = num
                yield (Message.Url, image_url,
                       text.nameext_from_url(image_url, data))

    @staticmethod
    def metadata(page):
        """Collect general image metadata

        Returns a dict with 'user', 'title', and 'description' built
        from the og:title / og:description values and the first <h1>
        element of 'page'.

        Raises NotFoundError if neither a non-empty <h1> title nor an
        og:title value is present.
        """
        # og:title contains data as "<user> - <title>", but both
        # <user> and <title> can contain a "-" as well, so we get the title
        # from somewhere else and cut that amount from the og:title content

        extr = text.extract_from(page)
        user = extr('property="og:title" content="', '"') or \
            extr('property=og:title content="', '"')
        descr = extr('property="og:description" content="', '"') or \
            extr('property=og:description content="', '"')

        # The extracted value still starts with the <h1> tag's attributes
        # (e.g. 'class="...">Title'); drop everything up to the first ">"
        # *before* testing for emptiness.  Otherwise an <h1> element
        # without content would be treated as a valid title and the
        # last 3 characters of the og:title value would be cut off.
        title = extr('<h1 ', '</h1>').partition(">")[2]

        if title:
            # remove the trailing " - <title>" from "<user> - <title>"
            user = user[:-len(title)-3]
        elif user:
            # no usable <h1>: split og:title at its first " - "
            user, _, title = user.partition(" - ")
        else:
            raise exception.NotFoundError()

        return {
            "user": text.unescape(user),
            "title": text.unescape(title),
            "description": text.unescape(descr),
        }

    @staticmethod
    def images(page):
        """Extract and return a list of all image-urls

        'data-src' values of lightbox elements are preferred; when there
        are none, every 'data-src' value on the page is used instead.
        """
        return (
            list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or
            list(text.extract_iter(page, 'data-src="', '"'))
        )
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`# -- coding: utf-8 --`

[myportfolio] use fallback when no images are found (#2959) 2022-09-23 13:03:33 +02:00			`# Copyright 2018-2022 Mike Fährmann`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract images from https://www.myportfolio.com/"""`

			`from .common import Extractor, Message`
[myportfolio] raise 'NotFoundError' for deleted posts 2020-07-27 16:15:24 +02:00			`from .. import text, exception`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00

			`class MyportfolioGalleryExtractor(Extractor):`
			`"""Extractor for an image gallery on www.myportfolio.com"""`
			`category = "myportfolio"`
			`subcategory = "gallery"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`directory_fmt = ("{category}", "{user}", "{title}")`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`filename_fmt = "{num:>02}.{extension}"`
change results of text.nameext_from_url() Instead of getting a complete 'filename' from an URL and splitting that into 'name' and 'extension', the new approach gets rid of the complete version and renames 'name' to 'filename'. (Using anything other than {extension} for a filename extension doesn't really work anyway) Example: "https://example.org/path/filename.ext" before: - filename : filename.ext - name : filename - extension: ext now: - filename : filename - extension: ext 2019-02-14 16:07:17 +01:00			`archive_fmt = "{user}_{filename}"`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)\|"`
generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 2021-12-29 22:39:29 +01:00			`r"(?:https?://)?([\w-]+\.myportfolio\.com))"`
			`r"(/[^/?&#]+)?")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`test = (`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", {`
			`"url": "acea0690c76db0e5cf267648cefd86e921bc3499",`
			`"keyword": "6ac6befe2ee0af921d24cf1dd4a4ed71be06db6d",`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`}),`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`("https://andrewling.myportfolio.com/", {`
remove '&' from URL patterns '/?&#' -> '/?#' and '?&#' -> '?#' According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components" 2020-10-22 23:12:59 +02:00			`"pattern": r"https://andrewling\.myportfolio\.com/[^/?#+]+$",`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`"count": ">= 6",`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`}),`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`("https://stevenilousphotography.myportfolio.com/society", {`
[myportfolio] raise 'NotFoundError' for deleted posts 2020-07-27 16:15:24 +02:00			`"exception": exception.NotFoundError,`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`}),`
			`# custom domain`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {`
			`"count": 3,`
			`}),`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`("myportfolio:https://tooco.com.ar/", {`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`"pattern": pattern,`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`"count": ">= 40",`
			`}),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`)`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 2019-02-11 13:31:10 +01:00			`Extractor.__init__(self, match)`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`domain1, domain2, self.path = match.groups()`
			`self.domain = domain1 or domain2`
			`self.prefix = "myportfolio:" if domain1 else ""`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00
			`def items(self):`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`url = "https://" + self.domain + (self.path or "")`
[myportfolio] fix extraction 2021-04-24 01:22:57 +02:00			`response = self.request(url)`
			`if response.history and response.url.endswith(".adobe.com/missing"):`
			`raise exception.NotFoundError()`
			`page = response.text`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00
replace 'text.extract()' with 'text.extr()' where possible 2022-11-04 23:39:38 +01:00			`projects = text.extr(`
			`page, '<section class="project-covers', '</section>')`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00
			`if projects:`
			`data = {"_extractor": MyportfolioGalleryExtractor}`
			`base = self.prefix + "https://" + self.domain`
			`for path in text.extract_iter(projects, ' href="', '"'):`
			`yield Message.Queue, base + path, data`
			`else:`
			`data = self.metadata(page)`
			`imgs = self.images(page)`
			`data["count"] = len(imgs)`
			`yield Message.Directory, data`
			`for data["num"], url in enumerate(imgs, 1):`
			`yield Message.Url, url, text.nameext_from_url(url, data)`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00
			`@staticmethod`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`def metadata(page):`
			`"""Collect general image metadata"""`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`# og:title contains data as "<user> - <title>", but both`
			`# <user> and <title> can contain a "-" as well, so we get the title`
			`# from somewhere else and cut that amount from the og:title content`

[myportfolio] fix extraction 2021-04-24 01:22:57 +02:00			`extr = text.extract_from(page)`
			`user = extr('property="og:title" content="', '"') or \`
			`extr('property=og:title content="', '"')`
			`descr = extr('property="og:description" content="', '"') or \`
			`extr('property=og:description content="', '"')`
			`title = extr('<h1 ', '</h1>')`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`if title:`
			`title = title.partition(">")[2]`
			`user = user[:-len(title)-3]`
[myportfolio] raise 'NotFoundError' for deleted posts 2020-07-27 16:15:24 +02:00			`elif user:`
[myportfolio] fix extraction of galleries without title 2020-04-08 21:08:05 +02:00			`user, _, title = user.partition(" - ")`
[myportfolio] raise 'NotFoundError' for deleted posts 2020-07-27 16:15:24 +02:00			`else:`
			`raise exception.NotFoundError()`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00
			`return {`
			`"user": text.unescape(user),`
			`"title": text.unescape(title),`
[myportfolio] fix extraction 2021-04-24 01:22:57 +02:00			`"description": text.unescape(descr),`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`}`

			`@staticmethod`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 2019-03-06 17:20:24 +01:00			`def images(page):`
[myportfolio] add user and gallery extractors (#95) 2018-07-19 18:56:45 +02:00			`"""Extract and return a list of all image-urls"""`
[myportfolio] use fallback when no images are found (#2959) 2022-09-23 13:03:33 +02:00			`return (`
			`list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or`
			`list(text.extract_iter(page, 'data-src="', '"'))`
			`)`