# -*- coding: utf-8 -*-

# Copyright 2016-2021 Mike Fährmann, Leonardo Taccari
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.slideshare.net/"""
|
2017-12-13 17:38:29 +01:00
|
|
|
|
|
|
|
from .common import Extractor, Message
|
2018-04-20 14:53:21 +02:00
|
|
|
from .. import text
|
2017-12-13 17:38:29 +01:00
|
|
|
|
|
|
|
|
2018-04-18 18:06:30 +02:00
|
|
|
class SlidesharePresentationExtractor(Extractor):
    """Extractor for images from a presentation on slideshare.net

    A presentation URL is matched by ``pattern``, which captures the
    user name and the presentation slug.  ``items()`` downloads the
    presentation page once, scrapes its metadata and yields one URL
    message per slide image found on that page.
    """
    category = "slideshare"
    subcategory = "presentation"
    # Format strings filled from the metadata dict built in
    # get_job_metadata() plus the per-image "num" set in items().
    directory_fmt = ("{category}", "{user}")
    filename_fmt = "{presentation}-{num:>02}.{extension}"
    archive_fmt = "{presentation}_{num}"
    # group 1: user name, group 2: presentation slug;
    # an optional "mobile/" path prefix is accepted and ignored
    pattern = (r"(?:https?://)?(?:www\.)?slideshare\.net"
               r"/(?:mobile/)?([^/?#]+)/([^/?#]+)")
    test = (
        (("https://www.slideshare.net"
          "/Slideshare/get-started-with-slide-share"), {
            "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
            "content": "2e90a01c6ca225579ebf8f98ab46f97a28a5e45c",
        }),
        # long title
        (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
          "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
            "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
        }),
        # mobile URL
        (("https://www.slideshare.net"
          "/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
            "url": "59993ad7b0cb93c73011547eedcd02c622649e9d",
        }),
    )

    def __init__(self, match):
        """Store the user name and presentation slug captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.user, self.presentation = match.groups()

    def items(self):
        """Yield a directory message followed by one URL message per image"""
        page = self.request("https://www.slideshare.net/" + self.user +
                            "/" + self.presentation).text
        data = self.get_job_metadata(page)
        imgs = self.get_image_urls(page)
        data["count"] = len(imgs)
        yield Message.Directory, data
        # The loop target assigns the running 1-based index straight into
        # data["num"]; the same dict object is reused for every image.
        for data["num"], url in enumerate(imgs, 1):
            yield Message.Url, url, text.nameext_from_url(url, data)

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job

        Scrapes 'page' (the presentation's HTML as a string) and returns
        a dict with the keys "user", "presentation", "title",
        "description", "views" and "published".

        Fields whose markup cannot be found fall back to an empty string
        (or to whatever text.parse_int() returns for an empty string in
        the case of "views") instead of raising AttributeError.
        """
        # NOTE(review): the fallbacks below assume text.extract() hands
        # back None as the value when a marker is not found -- confirm
        # against the text helper module.
        descr, pos = text.extract(
            page, '<meta name="description" content="', '"')
        # The first "metadata-item" block is not part of the result; it is
        # extracted only so that 'pos' moves past it.
        _, pos = text.extract(
            page, '<div class="metadata-item">', '</div>', pos)
        views, pos = text.extract(
            page, '<div class="metadata-item">', '</div>', pos)
        published, pos = text.extract(
            page, '<div class="metadata-item">', '</div>', pos)
        title, pos = text.extract(
            page, '<span class="j-title-breadcrumb">', '</span>', pos)
        alt_descr, pos = text.extract(
            page, '<p class="slideshow-description notranslate">', '</p>', pos)

        # Normalize missing values so the string methods below are safe.
        descr = descr or ""
        views = views or ""
        published = published or ""
        title = title or ""

        # A meta description ending in an ellipsis is replaced by the
        # text of the description paragraph, when that one is available.
        if descr.endswith("…") and alt_descr:
            descr = text.remove_html(alt_descr).strip()

        return {
            "user": self.user,
            "presentation": self.presentation,
            "title": text.unescape(title.strip()),
            "description": text.unescape(descr),
            # keep only what precedes the last " views" and drop the
            # comma separators before parsing
            "views": text.parse_int(views.rpartition(
                " views")[0].replace(",", "")),
            "published": published.strip(),
        }

    @staticmethod
    def get_image_urls(page):
        """Extract and return a list of all image-urls

        Collects every value found between 'data-full="' and the next
        '"' in 'page', in the order text.extract_iter() produces them.
        """
        return list(text.extract_iter(page, 'data-full="', '"'))
|