1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2024-11-02 17:22:31 +01:00

[bandcamp] restore album downloads

flake8 conform
This commit is contained in:
insaneracist 2020-10-27 19:21:34 -07:00
parent 6f8557ec4d
commit 48aac9fc86
2 changed files with 102 additions and 83 deletions

View File

@ -25,7 +25,45 @@
) )
class BandcampIE(InfoExtractor): class BandcampBaseIE(InfoExtractor):
"""Provide base functions for Bandcamp extractors"""
def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
    """Parse the JSON stored in a ``data-<suffix>="..."`` HTML attribute.

    :param webpage: HTML text of the page to search.
    :param suffix: attribute name without the ``data-`` prefix
        (callers in this file pass "tralbum" and "embed").
    :param video_id: identifier forwarded to ``self._parse_json``
        (presumably used for error reporting -- confirm against
        InfoExtractor).
    :return: whatever ``self._parse_json`` returns for the attribute
        value; when the attribute is absent the string '{}' is parsed
        instead, so callers can use ``.get()`` on the result.
    """
    # Escape the suffix so it is always matched literally, even if a
    # future caller passes a name containing regex metacharacters.
    attribute_pattern = r' data-%s="([^"]*)' % re.escape(suffix)
    json_string = self._html_search_regex(
        attribute_pattern, webpage, '%s json' % suffix, default='{}')
    return self._parse_json(json_string, video_id)
def _parse_json_track(self, json):
    """Normalize one Bandcamp track entry into a flat track dict.

    :param json: mapping describing a single track (callers in this file
        pass elements of the page's 'trackinfo' list). NOTE: the name
        shadows the stdlib ``json`` module inside this method; it is kept
        unchanged so existing callers are unaffected.
    :return: dict with the keys 'duration', 'id', 'title', 'title_link',
        'number' and 'formats'. 'formats' is always a list (possibly
        empty) because callers append further formats to it.
    """
    formats = []
    file_ = json.get('file')
    if isinstance(file_, dict):
        for format_id, format_url in file_.items():
            if not url_or_none(format_url):
                continue
            # partition() never raises: a format id without '-' yields
            # the whole id as the extension and no bitrate, instead of
            # aborting the extraction with a ValueError on unpacking.
            ext, _, abr_str = format_id.partition('-')
            formats.append({
                'format_id': format_id,
                'url': self._proto_relative_url(format_url, 'http:'),
                'ext': ext,
                'vcodec': 'none',
                'acodec': ext,
                'abr': int_or_none(abr_str or None),
            })

    return {
        'duration': float_or_none(json.get('duration')),
        'id': str_or_none(json.get('track_id') or json.get('id')),
        'title': json.get('title'),
        'title_link': json.get('title_link'),
        'number': int_or_none(json.get('track_num')),
        'formats': formats,
    }
class BandcampIE(BandcampBaseIE):
IE_NAME = "Bandcamp:track"
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song', 'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song',
@ -85,52 +123,32 @@ class BandcampIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
title = mobj.group('title') title = mobj.group('title')
url_track_title = title
webpage = self._download_webpage(url, title) webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None) thumbnail = self._html_search_meta('og:image', webpage, default=None)
track_id = None json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
track = None json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
track_number = None
duration = None
formats = [] json_tracks = json_tralbum.get('trackinfo')
trackinfo_block = self._html_search_regex( if not json_tracks:
r'trackinfo(?:["\']|&quot;):\[\s*({.+?})\s*\],(?:["\']|&quot;)', raise ExtractorError('Could not extract track')
webpage, 'track info', default='{}')
track_info = self._parse_json(trackinfo_block, title) track = self._parse_json_track(json_tracks[0])
if track_info: artist = json_tralbum.get('artist')
file_ = track_info.get('file') album_title = json_embed.get('album_title')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) json_album = json_tralbum.get('packages')
track_number = int_or_none(track_info.get('track_num')) if json_album:
duration = float_or_none(track_info.get('duration')) json_album = json_album[0]
album_publish_date = json_album.get('album_publish_date')
album_release_date = json_album.get('album_release_date')
else:
album_publish_date = None
album_release_date = json_tralbum.get('album_release_date')
def extract(key): timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
data = self._html_search_regex( release_date = unified_strdate(album_release_date)
r',(["\']|&quot;)%s\1:\1(?P<value>(?:\\\1|((?!\1).))+)\1' % key,
webpage, key, default=None, group='value')
return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data
track = extract('title')
artist = extract('artist')
album = extract('album_title')
timestamp = unified_timestamp(
extract('publish_date') or extract('album_publish_date'))
release_date = unified_strdate(extract('album_release_date'))
download_link = self._search_regex( download_link = self._search_regex(
r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage, r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
@ -155,8 +173,6 @@ def extract(key):
if info: if info:
downloads = info.get('downloads') downloads = info.get('downloads')
if isinstance(downloads, dict): if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist: if not artist:
artist = info.get('artist') artist = info.get('artist')
if not thumbnail: if not thumbnail:
@ -190,7 +206,7 @@ def extract(key):
retry_url = url_or_none(stat.get('retry_url')) retry_url = url_or_none(stat.get('retry_url'))
if not retry_url: if not retry_url:
continue continue
formats.append({ track['formats'].append({
'url': self._proto_relative_url(retry_url, 'http:'), 'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id), 'ext': download_formats.get(format_id),
'format_id': format_id, 'format_id': format_id,
@ -199,32 +215,37 @@ def extract(key):
'vcodec': 'none', 'vcodec': 'none',
}) })
self._sort_formats(formats) self._sort_formats(track['formats'])
title = '%s - %s' % (artist, track) if artist else track title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))
return { return {
'id': track_id, 'album': album_title,
'title': title,
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
'release_date': release_date,
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist, 'artist': artist,
'album': album, 'duration': track['duration'],
'formats': formats, 'formats': track['formats'],
'id': track['id'],
'release_date': release_date,
'thumbnail': thumbnail,
'timestamp': timestamp,
'title': title,
'track': track['title'],
'track_id': track['id'],
'track_number': track['number'],
'uploader': artist
} }
class BandcampAlbumIE(InfoExtractor): class BandcampAlbumTrackIE(BandcampIE):
IE_NAME = "Bandcamp:albumtrack"
"""Hack class to force album downloads to have prefixed track numbers by default"""
def _real_extract(self, url):
    """Extract a track and prefix its title with the track number.

    Delegates to the parent extractor, then rewrites 'title' as
    ``NN - <parent title>``. The parent builds its title as
    ``artist - track`` (or just ``track`` when no artist is known), so
    for a fully populated track the result matches the previous
    ``NN - artist - track`` format.

    :param url: URL of the track page.
    :return: the info dict produced by the parent extractor, with
        'title' prefixed when both a track number and a title exist.
    """
    # Explicit two-argument super(): the zero-argument form only works
    # on Python 3, and this file relies on py2/py3 compat shims.
    data = super(BandcampAlbumTrackIE, self)._real_extract(url)

    track_number = data.get('track_number')
    title = data.get('title')
    # The track number may be None; formatting None with a numeric
    # format spec raises TypeError, so leave the title untouched then.
    if track_number is not None and title is not None:
        data['title'] = '%02d - %s' % (track_number, title)
    return data
class BandcampAlbumIE(BandcampBaseIE):
IE_NAME = 'Bandcamp:album' IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
@ -305,34 +326,32 @@ def _real_extract(self, url):
album_id = mobj.group('album_id') album_id = mobj.group('album_id')
playlist_id = album_id or uploader_id playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id) webpage = self._download_webpage(url, playlist_id)
track_elements = re.findall(
r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
if not track_elements: json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
raise ExtractorError('The page doesn\'t contain any tracks')
json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract album tracks')
album_title = json_embed.get('album_title')
# Only tracks with duration info have songs # Only tracks with duration info have songs
tracks = [self._parse_json_track(track) for track in json_tracks]
entries = [ entries = [
self.url_result( self.url_result(
compat_urlparse.urljoin(url, t_path), compat_urlparse.urljoin(url, track['title_link']),
ie=BandcampIE.ie_key(), ie=BandcampAlbumTrackIE.ie_key(),
video_title=self._search_regex( video_title=track['title'])
r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', for track in tracks
elem_content, 'track title', fatal=False)) if track.get('duration')]
for elem_content, t_path in track_elements
if self._html_search_meta('duration', elem_content, default=None)]
title = self._html_search_regex(
r'album_title\s*(?:&quot;|["\']):\s*(&quot;|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1',
webpage, 'title', fatal=False, group='album')
if title:
title = title.replace(r'\"', '"')
return { return {
'_type': 'playlist', '_type': 'playlist',
'uploader_id': uploader_id, 'uploader_id': uploader_id,
'id': playlist_id, 'id': playlist_id,
'title': title, 'title': album_title,
'entries': entries, 'entries': entries
} }

View File

@ -84,7 +84,7 @@
) )
from .azmedien import AZMedienIE from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE from .baidu import BaiduVideoIE
from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bandcamp import BandcampIE, BandcampAlbumTrackIE, BandcampAlbumIE, BandcampWeeklyIE
from .bbc import ( from .bbc import (
BBCCoUkIE, BBCCoUkIE,
BBCCoUkArticleIE, BBCCoUkArticleIE,