From 9c44d2429b90dece734df778c63b04c15e91c1ca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 00:36:06 +0200 Subject: [PATCH] [vimeo:likes] Support large like lists (Fixes #3847) --- test/test_utils.py | 9 ++++- youtube_dl/extractor/vimeo.py | 66 ++++++++++++++++++++++----------- youtube_dl/extractor/youtube.py | 4 +- youtube_dl/utils.py | 39 +++++++++++++++++-- 4 files changed, 89 insertions(+), 29 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3efbed29d..6419b3ca9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -22,7 +22,8 @@ fix_xml_ampersands, get_meta_content, orderedSet, - PagedList, + OnDemandPagedList, + InAdvancePagedList, parse_duration, read_batch_urls, sanitize_filename, @@ -246,10 +247,14 @@ def get_page(pagenum): for i in range(firstid, upto): yield i - pl = PagedList(get_page, pagesize) + pl = OnDemandPagedList(get_page, pagesize) got = pl.getslice(*sliceargs) self.assertEqual(got, expected) + iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize) + got = iapl.getslice(*sliceargs) + self.assertEqual(got, expected) + testPL(5, 2, (), [0, 1, 2, 3, 4]) testPL(5, 2, (1,), [1, 2, 3, 4]) testPL(5, 2, (2,), [2, 3, 4]) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4be1b8785..403d0bb28 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,18 +8,19 @@ from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, - clean_html, - get_element_by_attribute, + compat_urlparse, ExtractorError, + get_element_by_attribute, + InAdvancePagedList, + int_or_none, RegexNotFoundError, - smuggle_url, std_headers, unsmuggle_url, urlencode_postdata, - int_or_none, ) @@ -533,32 +534,55 @@ def _real_extract(self, url): class VimeoLikesIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' _TEST = { - 'url': 'https://vimeo.com/user20132939/likes', - 'playlist_mincount': 4, - 'add_ies': ['Generic'], + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, "info_dict": { - "description": "Videos Philipp Hagemeister likes on Vimeo.", - "title": "Vimeo / Philipp Hagemeister's likes", - }, - 'params': { - 'extract_flat': False, + "description": "See all the videos urza likes", + "title": 'Videos urza likes', }, } def _real_extract(self, url): user_id = self._match_id(url) - rss_url = '%s//vimeo.com/user%s/likes/rss' % ( - self.http_scheme(), user_id) - surl = smuggle_url(rss_url, { - 'force_videoid': '%s_likes' % user_id, - 'to_generic': True, - }) + webpage = self._download_webpage(url, user_id) + page_count = self._int( + self._search_regex( + r'''(?x)
  • + .*?
  • \s* + ''', webpage, 'page count'), + 'page count', fatal=True) + PAGE_SIZE = 12 + title = self._html_search_regex( + r'(?s)

    (.+?)

    ', webpage, 'title', fatal=False) + description = self._html_search_meta('description', webpage) + + def _get_page(idx): + page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( + self.http_scheme(), user_id, idx + 1) + webpage = self._download_webpage( + page_url, user_id, + note='Downloading page %d/%d' % (idx + 1, page_count)) + video_list = self._search_regex( + r'(?s)
      ]*>(.*?)
    ', + webpage, 'video content') + paths = re.findall( + r']*>\s*