gallery-dl/gallery_dl/extractor/bcy.py

# -*- coding: utf-8 -*-

# Copyright 2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bcy.net/"""

from .common import Extractor, Message
from .. import text
import json
import re


class BcyExtractor(Extractor):
    """Base class for bcy extractors"""
    category = "bcy"
    directory_fmt = ("{category}", "{user[id]} {user[name]}")
    filename_fmt = "{post[id]} {id}.{extension}"
    archive_fmt = "{post[id]}_{id}"
    root = "https://bcy.net"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.item_id = match.group(1)

    def items(self):
        sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
        iroot = "https://img-bcy-qn.pstatp.com"
        noop = self.config("noop")

        for post_id in self.posts():
            post = self._parse_post(post_id)
            if not post:
                continue

            yield Message.Directory, post
            for post["num"], image in enumerate(post["_multi"], 1):
                post["id"] = image["mid"]
                post["width"] = image["w"]
                post["height"] = image["h"]

                url = image["path"].partition("~")[0]
                text.nameext_from_url(url, post)

                if post["extension"]:
                    if not url.startswith(iroot):
                        url = sub(iroot, url)
                    post["filter"] = ""
                    yield Message.Url, url, post

                else:
                    post["filter"] = "watermark"
                    yield Message.Url, image["origin"], post

                    if noop:
                        post["extension"] = ""
                        post["filter"] = "noop"
                        yield Message.Url, image["original_path"], post

    def _parse_post(self, post_id):
        url = "{}/item/detail/{}".format(self.root, post_id)
        response = self.request(url)
        if response.status_code >= 400:
            return None

        data = json.loads(
            text.extract(response.text, 'JSON.parse("', '");')[0]
            .replace('\\\\u002F', '/')
            .replace('\\"', '"')
        )["detail"]

        post = data["post_data"]
        if not post["multi"]:
            return None
        user = data["detail_user"]

        return {
            "user": {
                "id"     : user["uid"],
                "name"   : user["uname"],
                "avatar" : user["avatar"],
            },
            "post": {
                "id"     : text.parse_int(post["item_id"]),
                "tags"   : [t["tag_name"] for t in post["post_tags"]],
                "date"   : text.parse_timestamp(post["ctime"]),
                "parody" : text.parse_unicode_escapes(post["work"]),
                "content": post["plain"],
                "likes"  : post["like_count"],
                "shares" : post["share_count"],
                "replies": post["reply_count"],
            },
            "_multi": post["multi"],
        }


class BcyUserExtractor(BcyExtractor):
    """Extractor for user timelines"""
    subcategory = "user"
    pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
    test = ("https://bcy.net/u/1933712", {
        "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/\w+.jpg",
        "count": ">= 25",
    })

    def posts(self):
        url = self.root + "/apiv3/user/selfPosts"
        params = {
            "uid": self.item_id,
            "since": None,
            #  "_signature": None,
        }

        while True:
            data = self.request(url, params=params).json()

            item = None
            for item in data["data"]["items"]:
                if item["item_detail"]["multi"]:
                    yield item["item_detail"]["item_id"]

            if not item:
                return
            params["since"] = item["since"]


class BcyPostExtractor(BcyExtractor):
    """Extractor for individual posts"""
    subcategory = "post"
    pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
    test = ("https://bcy.net/item/detail/6355835481002893070", {
        "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
        "count": 1,
        "keyword": {
            "user": {
                "id"     : 1933712,
                "name"   : "wukloo",
                "avatar" : str,
            },
            "post": {
                "id"     : 6355835481002893070,
                "tags"   : list,
                "date"   : "dt:2016-11-22 08:47:46",
                "parody" : "东方PROJECT",
                "content": "re:根据微博的建议稍微做了点修改",
                "likes"  : int,
                "shares" : int,
                "replies": int,
            },
            "id": 8330182,
            "num": 1,
            "width" : 3000,
            "height": 1687,
            "filename": "712e0780b09011e696f973c3d1568337",
            "extension": "jpg",
        },
    })

    def posts(self):
        return (self.item_id,)
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00			`# -- coding: utf-8 --`

			`# Copyright 2020 Mike Fährmann`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://bcy.net/"""`

			`from .common import Extractor, Message`
			`from .. import text`
			`import json`
			`import re`


			`class BcyExtractor(Extractor):`
			`"""Base class for bcy extractors"""`
			`category = "bcy"`
			`directory_fmt = ("{category}", "{user[id]} {user[name]}")`
			`filename_fmt = "{post[id]} {id}.{extension}"`
			`archive_fmt = "{post[id]}_{id}"`
			`root = "https://bcy.net"`

			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
			`self.item_id = match.group(1)`

			`def items(self):`
			`sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub`
			`iroot = "https://img-bcy-qn.pstatp.com"`
[bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" 2020-02-28 22:57:10 +01:00			`noop = self.config("noop")`
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00
[bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" 2020-02-28 22:57:10 +01:00			`for post_id in self.posts():`
			`post = self._parse_post(post_id)`
			`if not post:`
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00			`continue`

[bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" 2020-02-28 22:57:10 +01:00			`yield Message.Directory, post`
			`for post["num"], image in enumerate(post["_multi"], 1):`
			`post["id"] = image["mid"]`
			`post["width"] = image["w"]`
			`post["height"] = image["h"]`

			`url = image["path"].partition("~")[0]`
			`text.nameext_from_url(url, post)`

			`if post["extension"]:`
			`if not url.startswith(iroot):`
			`url = sub(iroot, url)`
			`post["filter"] = ""`
			`yield Message.Url, url, post`

			`else:`
			`post["filter"] = "watermark"`
			`yield Message.Url, image["origin"], post`

			`if noop:`
			`post["extension"] = ""`
			`post["filter"] = "noop"`
			`yield Message.Url, image["original_path"], post`

			`def _parse_post(self, post_id):`
			`url = "{}/item/detail/{}".format(self.root, post_id)`
			`response = self.request(url)`
			`if response.status_code >= 400:`
			`return None`

			`data = json.loads(`
			`text.extract(response.text, 'JSON.parse("', '");')[0]`
			`.replace('\\\\u002F', '/')`
			`.replace('\\"', '"')`
			`)["detail"]`

			`post = data["post_data"]`
			`if not post["multi"]:`
			`return None`
			`user = data["detail_user"]`

			`return {`
			`"user": {`
			`"id" : user["uid"],`
			`"name" : user["uname"],`
			`"avatar" : user["avatar"],`
			`},`
			`"post": {`
			`"id" : text.parse_int(post["item_id"]),`
			`"tags" : [t["tag_name"] for t in post["post_tags"]],`
			`"date" : text.parse_timestamp(post["ctime"]),`
			`"parody" : text.parse_unicode_escapes(post["work"]),`
			`"content": post["plain"],`
			`"likes" : post["like_count"],`
			`"shares" : post["share_count"],`
			`"replies": post["reply_count"],`
			`},`
			`"_multi": post["multi"],`
			`}`
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00

			`class BcyUserExtractor(BcyExtractor):`
			`"""Extractor for user timelines"""`
			`subcategory = "user"`
			`pattern = r"(?:https?://)?bcy\.net/u/(\d+)"`
			`test = ("https://bcy.net/u/1933712", {`
			`"pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/\w+.jpg",`
			`"count": ">= 25",`
			`})`

			`def posts(self):`
			`url = self.root + "/apiv3/user/selfPosts"`
			`params = {`
			`"uid": self.item_id,`
			`"since": None,`
			`# "_signature": None,`
			`}`

			`while True:`
			`data = self.request(url, params=params).json()`

			`item = None`
			`for item in data["data"]["items"]:`
[bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" 2020-02-28 22:57:10 +01:00			`if item["item_detail"]["multi"]:`
			`yield item["item_detail"]["item_id"]`
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00
			`if not item:`
			`return`
			`params["since"] = item["since"]`


			`class BcyPostExtractor(BcyExtractor):`
			`"""Extractor for individual posts"""`
			`subcategory = "post"`
			`pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"`
			`test = ("https://bcy.net/item/detail/6355835481002893070", {`
			`"url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",`
			`"count": 1,`
			`"keyword": {`
			`"user": {`
			`"id" : 1933712,`
			`"name" : "wukloo",`
[bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" 2020-02-28 22:57:10 +01:00			`"avatar" : str,`
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00			`},`
			`"post": {`
			`"id" : 6355835481002893070,`
			`"tags" : list,`
add tests for specific datetime values 2020-02-23 16:48:30 +01:00			`"date" : "dt:2016-11-22 08:47:46",`
[bcy] add user and post extractors (#592) 2020-02-08 23:25:53 +01:00			`"parody" : "东方PROJECT",`
			`"content": "re:根据微博的建议稍微做了点修改",`
			`"likes" : int,`
			`"shares" : int,`
			`"replies": int,`
			`},`
			`"id": 8330182,`
			`"num": 1,`
			`"width" : 3000,`
			`"height": 1687,`
			`"filename": "712e0780b09011e696f973c3d1568337",`
			`"extension": "jpg",`
			`},`
			`})`

			`def posts(self):`
[bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" 2020-02-28 22:57:10 +01:00			`return (self.item_id,)`