From e81a47460365738a0add4d4da52a712c0091704f Mon Sep 17 00:00:00 2001 From: snipem Date: Fri, 3 Apr 2015 15:34:49 +0200 Subject: [PATCH 1/5] [Gamersyde] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/gamersyde.py | 64 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 youtube_dl/extractor/gamersyde.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index aae4aae4c..2935d5b33 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -177,6 +177,7 @@ GameOneIE, GameOnePlaylistIE, ) +from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gametrailers import GametrailersIE diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py new file mode 100644 index 000000000..c40106216 --- /dev/null +++ b/youtube_dl/extractor/gamersyde.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re +import json +import time +from .common import InfoExtractor + + +class GamersydeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' + _TEST = { + 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', + 'md5': 'f38d400d32f19724570040d5ce3a505f', + 'info_dict': { + 'id': '34371', + 'ext': 'mp4', + 'title': 'Bloodborne - Birth of a hero', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _calculateDuration(self, durationString): + duration = time.strptime(durationString, "%M minutes %S seconds") + return duration.tm_min * 60 + duration.tm_sec + + def _fixJsonSyntax(self, json): + + json = re.sub(r"{\s*(\w)", r'{"\1', json) + json = re.sub(r",\s*(\w)", r',"\1', json) + json = re.sub(r"(\w): ", r'\1":', json) + json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) + json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) + + return json + + def _real_extract(self, url): + + video_id = 
self._search_regex(r'-(.*?)_[a-z]{2}.html$', url, 'video_id') + webpage = self._download_webpage(url, video_id) + + filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) + filesJson = self._fixJsonSyntax(filesJson) + + data = json.loads(filesJson) + playlist = data[0] + + formats = [] + + title = re.sub(r"[0-9]+ - ", "", playlist['title']) + + for playlistEntry in playlist['sources']: + format = { + 'url': playlistEntry['file'], + 'format_id': playlistEntry['label'] + } + + formats.append(format) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': playlist['image'] + } From 115c281672bd7479f87c48249f6a0186ac7d19cc Mon Sep 17 00:00:00 2001 From: snipem Date: Sat, 4 Apr 2015 12:31:48 +0200 Subject: [PATCH 2/5] [Gamersyde] Improved robustness, added duration and tests The JSON syntax fix-up is now less error-prone when values themselves contain JSON-like syntax. The extractor now also uses the native JSON handling. Added tests for several videos that were producing errors in the first place. 
--- youtube_dl/extractor/gamersyde.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index c40106216..5c68a6891 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -8,7 +8,6 @@ class GamersydeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' - _TEST = { 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { @@ -17,6 +16,11 @@ class GamersydeIE(InfoExtractor): 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } + }, + { + 'url': 'http://www.gamersyde.com/hqstream_dark_souls_ii_scholar_of_the_first_sin_gameplay_part_1-34417_en.html', + 'info_dict': { + 'ext': 'mp4', } def _calculateDuration(self, durationString): @@ -27,7 +31,6 @@ def _fixJsonSyntax(self, json): json = re.sub(r"{\s*(\w)", r'{"\1', json) json = re.sub(r",\s*(\w)", r',"\1', json) - json = re.sub(r"(\w): ", r'\1":', json) json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) @@ -40,7 +43,6 @@ def _real_extract(self, url): filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) filesJson = self._fixJsonSyntax(filesJson) - data = json.loads(filesJson) playlist = data[0] From 3d24d997ae1f92686aa7edd0bfeed28353fbfb2e Mon Sep 17 00:00:00 2001 From: snipem Date: Sat, 4 Apr 2015 12:42:14 +0200 Subject: [PATCH 3/5] Fixed indentation of test cases The wrong indentation led to an error on a Linux machine --- youtube_dl/extractor/gamersyde.py | 45 ++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index 5c68a6891..cc6fa4037 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -1,39 +1,62 @@ # coding: utf-8 from __future__ import 
unicode_literals import re -import json import time + from .common import InfoExtractor class GamersydeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' + _TESTS = [{ 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { 'id': '34371', 'ext': 'mp4', + 'duration': 372, 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } - }, - { + }, { 'url': 'http://www.gamersyde.com/hqstream_dark_souls_ii_scholar_of_the_first_sin_gameplay_part_1-34417_en.html', + 'md5': '94bd7c3feff3275576cf5cb6c8a3a720', 'info_dict': { + 'id': '34417', 'ext': 'mp4', + 'duration': 270, + 'title': 'Dark Souls II: Scholar of the First Sin - Gameplay - Part 1', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, { + 'url': 'http://www.gamersyde.com/hqstream_grand_theft_auto_v_heists_trailer-33786_en.html', + 'md5': '65e442f5f340d571ece8c80d50700369', + 'info_dict': { + 'id': '33786', + 'ext': 'mp4', + 'duration': 59, + 'title': 'Grand Theft Auto V - Heists Trailer', + 'thumbnail': 're:^https?://.*\.jpg$', + } } + ] def _calculateDuration(self, durationString): - duration = time.strptime(durationString, "%M minutes %S seconds") + if (durationString.find("minutes") > -1): + duration = time.strptime(durationString, "%M minutes %S seconds") + else: + duration = time.strptime(durationString, "%S seconds") return duration.tm_min * 60 + duration.tm_sec def _fixJsonSyntax(self, json): - json = re.sub(r"{\s*(\w)", r'{"\1', json) - json = re.sub(r",\s*(\w)", r',"\1', json) json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) - + json = json.replace('file: "', '"file": "') + json = json.replace('title: "', '"title": "') + json = json.replace('label: "', '"label": "') + json = json.replace('image: "', '"image": "') + json = json.replace('sources: [', '"sources": [') return json def _real_extract(self, url): @@ -42,13 
+65,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) - filesJson = self._fixJsonSyntax(filesJson) - data = json.loads(filesJson) + data = self._parse_json(filesJson,video_id, transform_source=self._fixJsonSyntax) + playlist = data[0] formats = [] title = re.sub(r"[0-9]+ - ", "", playlist['title']) + + length = self._search_regex(r'(([0-9]{1,2} minutes ){0,1}[0-9]{1,2} seconds)', webpage, 'length') + duration = self._calculateDuration(length) for playlistEntry in playlist['sources']: format = { @@ -62,5 +88,6 @@ def _real_extract(self, url): 'id': video_id, 'title': title, 'formats': formats, + 'duration': duration, 'thumbnail': playlist['image'] } From ba9e68f40261355ceae5bb87c5707adc7f7beb2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 17:48:55 +0600 Subject: [PATCH 4/5] [utils] Drop trailing comma before closing brace --- test/test_utils.py | 6 ++++++ youtube_dl/utils.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index abaf1ab73..4e524aca3 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -470,6 +470,12 @@ def test_js_to_json_edgecases(self): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + on = js_to_json('["abc", "def",]') + self.assertEqual(json.loads(on), ['abc', 'def']) + + on = js_to_json('{"abc": "def",}') + self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 90e0ed9ab..e1761265c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1577,7 +1577,7 @@ def fix_kv(m): '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| [a-zA-Z_][.a-zA-Z_0-9]* ''', fix_kv, code) - res = re.sub(r',(\s*\])', lambda m: m.group(1), res) + res = 
re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) return res From 5c29dbd0c76083eaf596f623fabb612575f71861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 17:53:22 +0600 Subject: [PATCH 5/5] [gamersyde] Simplify --- youtube_dl/extractor/gamersyde.py | 103 ++++++++++++------------------ 1 file changed, 40 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index cc6fa4037..d545e01bb 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -1,14 +1,18 @@ -# coding: utf-8 from __future__ import unicode_literals + import re -import time from .common import InfoExtractor +from ..utils import ( + js_to_json, + parse_duration, + remove_start, +) class GamersydeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' - _TESTS = [{ + _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P<display_id>[\da-z_]+)-(?P<id>\d+)_[a-z]{2}\.html' + _TEST = { 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { @@ -18,76 +22,49 @@ class GamersydeIE(InfoExtractor): 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } - }, { - 'url': 'http://www.gamersyde.com/hqstream_dark_souls_ii_scholar_of_the_first_sin_gameplay_part_1-34417_en.html', - 'md5': '94bd7c3feff3275576cf5cb6c8a3a720', - 'info_dict': { - 'id': '34417', - 'ext': 'mp4', - 'duration': 270, - 'title': 'Dark Souls II: Scholar of the First Sin - Gameplay - Part 1', - 'thumbnail': 're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.gamersyde.com/hqstream_grand_theft_auto_v_heists_trailer-33786_en.html', - 'md5': '65e442f5f340d571ece8c80d50700369', - 'info_dict': { - 'id': '33786', - 'ext': 'mp4', - 'duration': 59, - 'title': 'Grand Theft Auto V - Heists Trailer', - 'thumbnail': 're:^https?://.*\.jpg$', - } } - ] - - def _calculateDuration(self, durationString): - if 
(durationString.find("minutes") > -1): - duration = time.strptime(durationString, "%M minutes %S seconds") - else: - duration = time.strptime(durationString, "%S seconds") - return duration.tm_min * 60 + duration.tm_sec - - def _fixJsonSyntax(self, json): - - json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) - json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) - json = json.replace('file: "', '"file": "') - json = json.replace('title: "', '"title": "') - json = json.replace('label: "', '"label": "') - json = json.replace('image: "', '"image": "') - json = json.replace('sources: [', '"sources": [') - return json def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') - video_id = self._search_regex(r'-(.*?)_[a-z]{2}.html$', url, 'video_id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) - data = self._parse_json(filesJson,video_id, transform_source=self._fixJsonSyntax) - - playlist = data[0] + playlist = self._parse_json( + self._search_regex( + r'(?s)playlist: \[({.+?})\]\s*}\);', webpage, 'files'), + display_id, transform_source=js_to_json) formats = [] - - title = re.sub(r"[0-9]+ - ", "", playlist['title']) - - length = self._search_regex(r'(([0-9]{1,2} minutes ){0,1}[0-9]{1,2} seconds)', webpage, 'length') - duration = self._calculateDuration(length) - - for playlistEntry in playlist['sources']: - format = { - 'url': playlistEntry['file'], - 'format_id': playlistEntry['label'] + for source in playlist['sources']: + video_url = source.get('file') + if not video_url: + continue + format_id = source.get('label') + f = { + 'url': video_url, + 'format_id': format_id, } + m = re.search(r'^(?P<height>\d+)[pP](?P<fps>\d+)fps', format_id) + if m: + f.update({ + 'height': int(m.group('height')), + 'fps': int(m.group('fps')), + }) + formats.append(f) + 
self._sort_formats(formats) - formats.append(format) + title = remove_start(playlist['title'], '%s - ' % video_id) + thumbnail = playlist.get('image') + duration = parse_duration(self._search_regex( + r'Length:([^<]+)<', webpage, 'duration', fatal=False)) return { 'id': video_id, + 'display_id': display_id, 'title': title, - 'formats': formats, + 'thumbnail': thumbnail, 'duration': duration, - 'thumbnail': playlist['image'] - } + 'formats': formats, + }