From 4ec929dc9b55a2588b4a27e64871c5bfa900bf37 Mon Sep 17 00:00:00 2001 From: huohuarong Date: Sat, 3 Aug 2013 10:29:58 +0800 Subject: [PATCH] use ..utils/clean_html() --- youtube_dl/extractor/sohu.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 830814221..cf0ab5478 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -7,7 +7,7 @@ import urllib2 from .common import InfoExtractor -from ..utils import compat_urllib_request +from ..utils import compat_urllib_request, clean_html class SohuIE(InfoExtractor): @@ -22,16 +22,6 @@ class SohuIE(InfoExtractor): }, } - def _clearn_html(self, string): - tags = re.findall(r'<.+?>', string) - for t in tags: - string = string.replace(t, ' ') - for i in range(2): - spaces = re.findall(r'\s+', string) - for s in spaces: - string = string.replace(s, ' ') - string = string.strip() - return string def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -40,7 +30,7 @@ def _real_extract(self, url): pattern = r'

\n*?(.+?)\n*?

' compiled = re.compile(pattern, re.DOTALL) title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') - title = self._clearn_html(title) + title = clean_html(title) pattern = re.compile(r'var vid="(\d+)"') result = re.search(pattern, webpage) if not result: @@ -93,5 +83,8 @@ def _real_extract(self, url): } files_info.append(info) time.sleep(1) - + if num_of_parts == 1: + info = files_info[0] + info['id'] = video_id + return info return files_info