gallery-dl/gallery_dl/extractor/bbc.py

# -*- coding: utf-8 -*-

# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bbc.co.uk/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text, util
import json

# Common URL prefix shared by all extractor patterns in this module.
# Matches an optional scheme and optional "www." in front of "bbc.co.uk".
# NOTE: the capture group opened at "(/programmes/" is intentionally left
# unclosed here; each extractor's 'pattern' appends the rest of the path
# and supplies the closing parenthesis.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"


class BbcGalleryExtractor(GalleryExtractor):
    """Extractor for a programme gallery on bbc.co.uk"""
    category = "bbc"
    root = "https://www.bbc.co.uk"
    directory_fmt = (
        "{category}",
        "{path[0]}",
        "{path[1]}",
        "{path[2]}",
        "{path[3:]:J - /}",
    )
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{programme}_{num}"
    # a programme ID, optionally followed by one more path segment,
    # but never the '/galleries' listing handled by the programme extractor
    pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"
    test = (
        ("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", {
            "pattern": r"https://ichef\.bbci\.co\.uk"
                       r"/images/ic/1920xn/\w+\.jpg",
            "count": 37,
            "keyword": {
                "programme": "p084qtzs",
                "path": ["BBC One", "Doctor Who", "The Timeless Children"],
            },
        }),
        ("https://www.bbc.co.uk/programmes/p084qtzs"),
    )

    def metadata(self, page):
        """Collect gallery metadata from the JSON-LD script in 'page'

        Returns a dict with
        - "programme": the 5th '/'-separated component of the gallery URL
        - "path": the "name" of every "itemListElement" entry,
          passed through util.unique_sequence()
        """
        jsonld = text.extract(
            page, '<script type="application/ld+json">', '</script>')[0]
        data = json.loads(jsonld)

        programme = self.gallery_url.split("/")[4]
        names = (item["name"] for item in data["itemListElement"])

        return {
            "programme": programme,
            "path": list(util.unique_sequence(names)),
        }

    def images(self, page):
        """Return a list of (URL, metadata) tuples for all images in 'page'

        The thumbnail size segment of every 'data-image-src' value is
        swapped for the requested width ('width' option, default 1920);
        smaller widths are attached as "_fallback" URLs.
        """
        width = self.config("width")
        if width:
            # round down to the nearest multiple of 16
            width = width - width % 16
        else:
            width = 1920
        dimensions = "/{}xn/".format(width)

        results = []
        for src in text.extract_iter(page, 'data-image-src="', '"'):
            url = src.replace("/320x180_b/", dimensions)
            fallback = self._fallback_urls(src, width)
            results.append((url, {"_fallback": fallback}))
        return results

    @staticmethod
    def _fallback_urls(src, max_width):
        """Yield variants of 'src' for each preset width below 'max_width'"""
        prefix, _, suffix = src.partition("/320x180_b/")
        smaller = (
            preset
            for preset in (1920, 1600, 1280, 976)
            if preset < max_width
        )
        for preset in smaller:
            yield "{}/{}xn/{}".format(prefix, preset, suffix)


class BbcProgrammeExtractor(Extractor):
    """Extractor for all galleries of a bbc programme"""
    category = "bbc"
    subcategory = "programme"
    root = "https://www.bbc.co.uk"
    # group 1: path of the galleries listing, group 2: optional page number
    pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
    test = (
        ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", {
            "pattern": BbcGalleryExtractor.pattern,
            "range": "1-50",
            "count": ">= 50",
        }),
        ("https://www.bbc.co.uk/programmes/b006q2x0/galleries?page=40", {
            "pattern": BbcGalleryExtractor.pattern,
            "count": ">= 100",
        }),
    )

    def __init__(self, match):
        """Store listing path and starting page number from 'match'"""
        Extractor.__init__(self, match)
        path, page = match.groups()
        self.path = path
        self.page = page

    def items(self):
        """Yield one Queue message per gallery link on each listing page

        Starts at the page number from the URL (1 if absent) and keeps
        requesting the following page while the current one contains
        a 'rel="next"' marker.
        """
        prefix = "https://www.bbc.co.uk/programmes/"
        queue_data = {"_extractor": BbcGalleryExtractor}
        query = {"page": text.parse_int(self.page, 1)}
        listing_url = self.root + self.path

        while True:
            html = self.request(listing_url, params=query).text

            links = text.extract_iter(
                html, '<a href="https://www.bbc.co.uk/programmes/', '"')
            for programme_id in links:
                yield Message.Queue, prefix + programme_id, queue_data

            if 'rel="next"' in html:
                query["page"] += 1
                continue
            break
[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00			`# -- coding: utf-8 --`

			`# Copyright 2021 Mike Fährmann`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://bbc.co.uk/"""`

			`from .common import GalleryExtractor, Extractor, Message`
			`from .. import text, util`
			`import json`

			`BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"`


			`class BbcGalleryExtractor(GalleryExtractor):`
			`"""Extractor for a programme gallery on bbc.co.uk"""`
			`category = "bbc"`
			`root = "https://www.bbc.co.uk"`
			`directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}",`
			`"{path[3:]:J - /}")`
			`filename_fmt = "{num:>02}.{extension}"`
			`archive_fmt = "{programme}_{num}"`
			`pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"`
			`test = (`
			`("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", {`
			`"pattern": r"https://ichef\.bbci\.co\.uk"`
[bbc] improve image dimensions (#1706) download the 1920xN versions instead of 976x549 2021-07-29 03:22:15 +02:00			`r"/images/ic/1920xn/\w+\.jpg",`
[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00			`"count": 37,`
			`"keyword": {`
			`"programme": "p084qtzs",`
			`"path": ["BBC One", "Doctor Who", "The Timeless Children"],`
			`},`
			`}),`
			`("https://www.bbc.co.uk/programmes/p084qtzs"),`
			`)`

			`def metadata(self, page):`
			`data = json.loads(text.extract(`
			`page, '<script type="application/ld+json">', '</script>')[0])`
			`return {`
			`"programme": self.gallery_url.split("/")[4],`
			`"path": list(util.unique_sequence(`
			`element["name"]`
			`for element in data["itemListElement"]`
			`)),`
			`}`

			`def images(self, page):`
[bbc] add 'width' option (#1706) 2021-07-30 01:09:32 +02:00			`width = self.config("width")`
			`width = width - width % 16 if width else 1920`
			`dimensions = "/{}xn/".format(width)`

[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00			`return [`
[bbc] provide fallback URLs (#1706) 2021-07-30 01:14:07 +02:00			`(src.replace("/320x180_b/", dimensions),`
			`{"_fallback": self._fallback_urls(src, width)})`
[bbc] improve image dimensions (#1706) download the 1920xN versions instead of 976x549 2021-07-29 03:22:15 +02:00			`for src in text.extract_iter(page, 'data-image-src="', '"')`
[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00			`]`

[bbc] provide fallback URLs (#1706) 2021-07-30 01:14:07 +02:00			`@staticmethod`
			`def _fallback_urls(src, max_width):`
			`front, _, back = src.partition("/320x180_b/")`
			`for width in (1920, 1600, 1280, 976):`
			`if width < max_width:`
			`yield "{}/{}xn/{}".format(front, width, back)`

[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00
			`class BbcProgrammeExtractor(Extractor):`
			`"""Extractor for all galleries of a bbc programme"""`
			`category = "bbc"`
			`subcategory = "programme"`
			`root = "https://www.bbc.co.uk"`
[bbc] support multi-page gallery listings (closes #1730) 2021-07-28 22:42:33 +02:00			`pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"`
			`test = (`
			`("https://www.bbc.co.uk/programmes/b006q2x0/galleries", {`
			`"pattern": BbcGalleryExtractor.pattern,`
			`"range": "1-50",`
			`"count": ">= 50",`
			`}),`
			`("https://www.bbc.co.uk/programmes/b006q2x0/galleries?page=40", {`
			`"pattern": BbcGalleryExtractor.pattern,`
			`"count": ">= 100",`
			`}),`
			`)`
[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00
			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
[bbc] support multi-page gallery listings (closes #1730) 2021-07-28 22:42:33 +02:00			`self.path, self.page = match.groups()`
[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00
			`def items(self):`
			`data = {"_extractor": BbcGalleryExtractor}`
[bbc] support multi-page gallery listings (closes #1730) 2021-07-28 22:42:33 +02:00			`params = {"page": text.parse_int(self.page, 1)}`
			`galleries_url = self.root + self.path`
[bbc] add 'gallery' and 'programme' extractors (closes #1706) 2021-07-22 20:37:05 +02:00
[bbc] support multi-page gallery listings (closes #1730) 2021-07-28 22:42:33 +02:00			`while True:`
			`page = self.request(galleries_url, params=params).text`
			`for programme_id in text.extract_iter(`
			`page, '<a href="https://www.bbc.co.uk/programmes/', '"'):`
			`url = "https://www.bbc.co.uk/programmes/" + programme_id`
			`yield Message.Queue, url, data`
			`if 'rel="next"' not in page:`
			`return`
			`params["page"] += 1`