# -*- coding: utf-8 -*-

# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.behance.net/"""

from .common import Extractor, Message
from .. import text, util, exception


class BehanceExtractor(Extractor):
    """Shared functionality for all behance extractors

    Handles the 'bcp' cookie / 'X-BCP' header pair, GraphQL requests,
    and normalization of gallery metadata.
    """
    category = "behance"
    root = "https://www.behance.net"
    # NOTE(review): presumably a (min, max) delay range in seconds that is
    # consumed by the Extractor base class -- confirm there
    request_interval = (2.0, 4.0)

    def _init(self):
        """Load the 'bcp' cookie value, installing a fallback if absent"""
        domain = "www.behance.net"

        self._bcp = self.cookies.get("bcp", domain=domain)
        if self._bcp:
            return

        # no usable cookie for this domain: use a hard-coded value and
        # store it so cookie and 'X-BCP' header stay in sync
        fallback = "4c34489d-914c-46cd-b44c-dfd0e661136d"
        self._bcp = fallback
        self.cookies.set("bcp", fallback, domain=domain)

    def items(self):
        """Queue every gallery from galleries() for the gallery extractor"""
        for entry in self.galleries():
            entry["_extractor"] = BehanceGalleryExtractor
            target = entry["url"]
            yield Message.Queue, target, self._update(entry)

    def galleries(self):
        """Provide the gallery objects that items() iterates over

        The base implementation has no body and therefore returns None;
        items() expects an iterable of dicts containing at least 'url'.
        """

    def _request_graphql(self, endpoint, variables):
        """POST a GraphQL query and return the 'data' member of the reply

        'endpoint' selects the query text from GRAPHQL_QUERIES;
        'variables' is sent along unchanged.
        """
        api_url = self.root + "/v3/graphql"
        request_headers = {
            "Origin": self.root,
            "X-BCP" : self._bcp,
            "X-Requested-With": "XMLHttpRequest",
        }
        payload = {
            "query"    : GRAPHQL_QUERIES[endpoint],
            "variables": variables,
        }

        response = self.request(
            api_url, method="POST", headers=request_headers, json=payload)
        return response.json()["data"]

    def _update(self, data):
        """Normalize gallery metadata in place and return the same dict

        Flattens 'fields', 'owners', and 'tags' into plain lists, adds
        'date', and fills the legacy 'gallery_id', 'title', and 'user' keys.
        """
        # 'fields' may already be a flat list; only convert lists of dicts
        fields = data.get("fields")
        if fields and isinstance(fields[0], dict):
            data["fields"] = [
                entry.get("name") or entry.get("label")
                for entry in fields
            ]

        # owners carry their name under one of two possible keys
        owner_names = []
        for owner in data["owners"]:
            owner_names.append(
                owner.get("display_name") or owner.get("displayName"))
        data["owners"] = owner_names

        # missing/empty tags become an empty tuple,
        # tag objects are reduced to their titles
        tags = data.get("tags")
        if not tags:
            tags = ()
        elif isinstance(tags[0], dict):
            tags = [tag["title"] for tag in tags]
        data["tags"] = tags

        # fall back to 0 when neither timestamp key has a usable value
        timestamp = data.get("publishedOn") or data.get("conceived_on") or 0
        data["date"] = text.parse_timestamp(timestamp)

        # keys kept for backwards compatibility
        data["gallery_id"] = data["id"]
        data["title"] = data["name"]
        data["user"] = ", ".join(owner_names)

        return data
BehanceGalleryExtractor(BehanceExtractor): """Extractor for image galleries from www.behance.net""" subcategory = "gallery" directory_fmt = ("{category}", "{owners:J, }", "{id} {name}") filename_fmt = "{category}_{id}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" pattern = r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)" example = "https://www.behance.net/gallery/12345/TITLE" def __init__(self, match): BehanceExtractor.__init__(self, match) self.gallery_id = match.group(1) def _init(self): BehanceExtractor._init(self) modules = self.config("modules") if modules: if isinstance(modules, str): modules = modules.split(",") self.modules = set(modules) else: self.modules = {"image", "video", "mediacollection", "embed"} def items(self): data = self.get_gallery_data() imgs = self.get_images(data) data["count"] = len(imgs) yield Message.Directory, data for data["num"], (url, module) in enumerate(imgs, 1): data["module"] = module data["extension"] = (module.get("extension") or text.ext_from_url(url)) yield Message.Url, url, data def get_gallery_data(self): """Collect gallery info dict""" url = "{}/gallery/{}/a".format(self.root, self.gallery_id) cookies = { "gki": '{"feature_project_view":false,' '"feature_discover_login_prompt":false,' '"feature_project_login_prompt":false}', "ilo0": "true", } page = self.request(url, cookies=cookies).text data = util.json_loads(text.extr( page, 'id="beconfig-store_state">', '')) return self._update(data["project"]["project"]) def get_images(self, data): """Extract image results from an API response""" if not data["modules"]: access = data.get("matureAccess") if access == "logged-out": raise exception.AuthorizationError( "Mature content galleries require logged-in cookies") if access == "restricted-safe": raise exception.AuthorizationError( "Mature content blocked in account settings") if access and access != "allowed": raise exception.AuthorizationError() return () result = [] append = result.append for module in data["modules"]: 
mtype = module["__typename"][:-6].lower() if mtype not in self.modules: self.log.debug("Skipping '%s' module", mtype) continue if mtype == "image": sizes = { size["url"].rsplit("/", 2)[1]: size for size in module["imageSizes"]["allAvailable"] } size = (sizes.get("source") or sizes.get("max_3840") or sizes.get("fs") or sizes.get("hd") or sizes.get("disp")) append((size["url"], module)) elif mtype == "video": try: url = text.extr(module["embed"], 'src="', '"') page = self.request(text.unescape(url)).text url = text.extr(page, '