Fix title parsing bug

This commit is contained in:
huohuarong 2013-08-05 22:51:54 +08:00
parent 4ec929dc9b
commit b5a6d40818

View File

@@ -27,10 +27,10 @@ def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>' pattern = r'<title>(.+?)</title>'
compiled = re.compile(pattern, re.DOTALL) compiled = re.compile(pattern, re.DOTALL)
title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') title = self._search_regex(compiled, webpage, u'video title')
title = clean_html(title) title = clean_html(title).split('-')[0].strip()
pattern = re.compile(r'var vid="(\d+)"') pattern = re.compile(r'var vid="(\d+)"')
result = re.search(pattern, webpage) result = re.search(pattern, webpage)
if not result: if not result:
@@ -41,7 +41,8 @@ def _real_extract(self, url):
base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
url_1 = base_url_1 + vid url_1 = base_url_1 + vid
logging.info('json url: %s' % url_1) logging.info('json url: %s' % url_1)
json_1 = json.loads(urllib2.urlopen(url_1).read()) webpage = self._download_webpage(url_1, vid)
json_1 = json.loads(webpage)
# get the highest definition video vid and json information. # get the highest definition video vid and json information.
vids = [] vids = []
qualities = ('oriVid', 'superVid', 'highVid', 'norVid') qualities = ('oriVid', 'superVid', 'highVid', 'norVid')