1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-25 20:22:36 +01:00

[blogger] implement video extraction (closes #587)

This commit is contained in:
Mike Fährmann 2020-01-23 22:42:56 +01:00
parent b3b5754f2d
commit 6703b8a86b
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
3 changed files with 46 additions and 9 deletions

View File

@ -496,6 +496,15 @@ Description Try to follow external URLs of embedded players.
=========== ===== =========== =====
extractor.blogger.videos
------------------------
=========== =====
Type ``bool``
Default ``true``
Description Download videos that are embedded in posts and hosted on https://www.blogger.com/
=========== =====
extractor.danbooru.ugoira extractor.danbooru.ugoira
------------------------- -------------------------
=========== ===== =========== =====

View File

@ -17,6 +17,10 @@
{ {
"external": false "external": false
}, },
"blogger":
{
"videos": true
},
"danbooru": "danbooru":
{ {
"username": null, "username": null,

View File

@ -10,6 +10,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text
import json
import re import re
BASE_PATTERN = ( BASE_PATTERN = (
@ -28,6 +29,7 @@ class BloggerExtractor(Extractor):
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.videos = self.config("videos", True)
self.blog = match.group(1) or match.group(2) self.blog = match.group(1) or match.group(2)
self.api = BloggerAPI(self) self.api = BloggerAPI(self)
@ -41,24 +43,41 @@ class BloggerExtractor(Extractor):
del blog["selfLink"] del blog["selfLink"]
sub = re.compile(r"/s\d+/").sub sub = re.compile(r"/s\d+/").sub
findall = re.compile( findall_image = re.compile(
r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall
findall_video = re.compile(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
for post in self.posts(blog): for post in self.posts(blog):
images = findall(post["content"]) content = post["content"]
if not images:
files = findall_image(content)
for idx, url in enumerate(files):
files[idx] = sub("/s0/", url).replace("http:", "https:", 1)
if self.videos and 'id="BLOG_video-' in content:
page = self.request(post["url"]).text
for url in findall_video(page):
page = self.request(url).text
video_config = json.loads(text.extract(
page, 'var VIDEO_CONFIG =', '\n')[0])
files.append(max(
video_config["streams"],
key=lambda x: x["format_id"],
)["play_url"])
if not files:
continue continue
post["author"] = post["author"]["displayName"] post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"] post["replies"] = post["replies"]["totalItems"]
post["content"] = text.remove_html(post["content"]) post["content"] = text.remove_html(content)
post["date"] = text.parse_datetime(post["published"]) post["date"] = text.parse_datetime(post["published"])
del post["selfLink"] del post["selfLink"]
del post["blog"] del post["blog"]
yield Message.Directory, {"blog": blog, "post": post} yield Message.Directory, {"blog": blog, "post": post}
for num, url in enumerate(images, 1): for num, url in enumerate(files, 1):
url = sub("/s0/", url).replace("http:", "https:", 1)
yield Message.Url, url, text.nameext_from_url(url, { yield Message.Url, url, text.nameext_from_url(url, {
"blog": blog, "blog": blog,
"post": post, "post": post,
@ -112,6 +131,11 @@ class BloggerPostExtractor(BloggerExtractor):
("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", { ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
"url": "9928429fb62f712eb4de80f53625eccecc614aae", "url": "9928429fb62f712eb4de80f53625eccecc614aae",
}), }),
# video (#587)
(("http://cfnmscenesinmovies.blogspot.com/2011/11/"
"cfnm-scene-jenna-fischer-in-office.html"), {
"pattern": r"https://.+\.googlevideo\.com/videoplayback",
}),
) )
def __init__(self, match): def __init__(self, match):
@ -171,8 +195,8 @@ class BloggerAPI():
def _pagination(self, endpoint, params): def _pagination(self, endpoint, params):
while True: while True:
data = self._call(endpoint, params) data = self._call(endpoint, params)
if "items" in data:
yield from data["items"] yield from data["items"]
if "nextPageToken" not in data: if "nextPageToken" not in data:
return return
params["pageToken"] = data["nextPageToken"] params["pageToken"] = data["nextPageToken"]