Merge branch 'extract_info_rewrite'

This commit is contained in:
Jaime Marquínez Ferrándiz 2013-04-19 21:57:08 +02:00
commit dce9027045
4 changed files with 176 additions and 114 deletions

View File

@ -150,6 +150,8 @@ # OUTPUT TEMPLATE
- `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4). - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4).
- `epoch`: The sequence will be replaced by the Unix epoch when creating the file. - `epoch`: The sequence will be replaced by the Unix epoch when creating the file.
- `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero.
- `playlist`: The name or the id of the playlist that contains the video.
- `playlist_index`: The index of the video in the playlist, a five-digit number.
The current default template is `%(id)s.%(ext)s`, but that will be switched to `%(title)s-%(id)s.%(ext)s` (which can be requested with `-t` at the moment). The current default template is `%(id)s.%(ext)s`, but that will be switched to `%(title)s-%(id)s.%(ext)s` (which can be requested with `-t` at the moment).

View File

@ -10,6 +10,7 @@
from youtube_dl.InfoExtractors import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE from youtube_dl.InfoExtractors import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE
from youtube_dl.utils import * from youtube_dl.utils import *
from youtube_dl.FileDownloader import FileDownloader
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
@ -22,7 +23,7 @@
opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
compat_urllib_request.install_opener(opener) compat_urllib_request.install_opener(opener)
class FakeDownloader(object): class FakeDownloader(FileDownloader):
def __init__(self): def __init__(self):
self.result = [] self.result = []
self.params = parameters self.params = parameters
@ -30,35 +31,42 @@ def to_screen(self, s):
print(s) print(s)
def trouble(self, s): def trouble(self, s):
raise Exception(s) raise Exception(s)
def download(self, x): def extract_info(self, url):
self.result.append(x) self.result.append(url)
return url
class TestYoutubeLists(unittest.TestCase): class TestYoutubeLists(unittest.TestCase):
def assertIsPlaylist(self,info):
    """Check that the extractor result `info` is tagged as a playlist.

    The '_type' entry of `info` is compared against 'playlist' with
    assertEqual.
    """
    result_type = info['_type']
    self.assertEqual(result_type, 'playlist')
def test_youtube_playlist(self): def test_youtube_playlist(self):
dl = FakeDownloader() dl = FakeDownloader()
ie = YoutubePlaylistIE(dl) ie = YoutubePlaylistIE(dl)
ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0]
ytie_results = [YoutubeIE()._extract_id(r[0]) for r in dl.result] self.assertIsPlaylist(result)
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE']) self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE'])
def test_issue_673(self): def test_issue_673(self):
dl = FakeDownloader() dl = FakeDownloader()
ie = YoutubePlaylistIE(dl) ie = YoutubePlaylistIE(dl)
ie.extract('PLBB231211A4F62143') result = ie.extract('PLBB231211A4F62143')[0]
self.assertTrue(len(dl.result) > 40) self.assertTrue(len(result['entries']) > 40)
def test_youtube_playlist_long(self): def test_youtube_playlist_long(self):
dl = FakeDownloader() dl = FakeDownloader()
ie = YoutubePlaylistIE(dl) ie = YoutubePlaylistIE(dl)
ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0]
self.assertTrue(len(dl.result) >= 799) self.assertIsPlaylist(result)
self.assertTrue(len(result['entries']) >= 799)
def test_youtube_playlist_with_deleted(self): def test_youtube_playlist_with_deleted(self):
#651 #651
dl = FakeDownloader() dl = FakeDownloader()
ie = YoutubePlaylistIE(dl) ie = YoutubePlaylistIE(dl)
ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0]
ytie_results = [YoutubeIE()._extract_id(r[0]) for r in dl.result] ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
self.assertFalse('pElCt5oNDuI' in ytie_results) self.assertFalse('pElCt5oNDuI' in ytie_results)
self.assertFalse('KdPEApIVdWM' in ytie_results) self.assertFalse('KdPEApIVdWM' in ytie_results)
@ -66,10 +74,11 @@ def test_youtube_course(self):
dl = FakeDownloader() dl = FakeDownloader()
ie = YoutubePlaylistIE(dl) ie = YoutubePlaylistIE(dl)
# TODO find a > 100 (paginating?) videos course # TODO find a > 100 (paginating?) videos course
ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0]
self.assertEqual(YoutubeIE()._extract_id(dl.result[0][0]), 'j9WZyLZCBzs') entries = result['entries']
self.assertEqual(len(dl.result), 25) self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
self.assertEqual(YoutubeIE()._extract_id(dl.result[-1][0]), 'rYefUsYuEp0') self.assertEqual(len(entries), 25)
self.assertEqual(YoutubeIE()._extract_id(entries[-1]['url']), 'rYefUsYuEp0')
def test_youtube_channel(self): def test_youtube_channel(self):
# I give up, please find a channel that does paginate and test this like test_youtube_playlist_long # I give up, please find a channel that does paginate and test this like test_youtube_playlist_long
@ -78,8 +87,8 @@ def test_youtube_channel(self):
def test_youtube_user(self): def test_youtube_user(self):
dl = FakeDownloader() dl = FakeDownloader()
ie = YoutubeUserIE(dl) ie = YoutubeUserIE(dl)
ie.extract('https://www.youtube.com/user/TheLinuxFoundation') result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
self.assertTrue(len(dl.result) >= 320) self.assertTrue(len(result['entries']) >= 320)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -393,6 +393,8 @@ def prepare_filename(self, info_dict):
autonumber_size = 5 autonumber_size = 5
autonumber_templ = u'%0' + str(autonumber_size) + u'd' autonumber_templ = u'%0' + str(autonumber_size) + u'd'
template_dict['autonumber'] = autonumber_templ % self._num_downloads template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict['playlist_index'] is not None:
template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
sanitize = lambda k,v: sanitize_filename( sanitize = lambda k,v: sanitize_filename(
u'NA' if v is None else compat_str(v), u'NA' if v is None else compat_str(v),
@ -423,9 +425,109 @@ def _match_entry(self, info_dict):
return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
return None return None
def extract_info(self, url, download=True):
    """
    Extract the information of `url` with the first suitable InfoExtractor.

    The extractors in self._ies are tried in order and only the first one
    whose suitable() accepts the url is used. Every result it yields is
    tagged with the extractor name (unless it already carries one) and
    handed to process_ie_result().

    url      -- the url to extract
    download -- if true, the videos are also downloaded

    Returns a list with one processed result per extractor result (empty
    if the extractor returned nothing), or None when no extractor was
    suitable or the extraction failed; in both cases the problem has been
    reported through self.trouble().
    MaxDownloadsReached is always propagated to the caller.
    """
    suitable_found = False
    for ie in self._ies:
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            continue

        # Warn if the _WORKING attribute is False
        if not ie.working():
            self.to_stderr(u'WARNING: the program functionality for this site has been marked as broken, '
                           u'and will probably not work. If you want to go on, use the -i option.')

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it
        try:
            # Old-style extractors may return None instead of a list
            ie_results = ie.extract(url) or []
            results = []
            for ie_result in ie_results:
                if 'extractor' not in ie_result:
                    # Only set it when it has not been set somewhere else
                    ie_result['extractor'] = ie.IE_NAME
                results.append(self.process_ie_result(ie_result, download))
            return results
        except ExtractorError as de:  # An error we somewhat expected
            self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback())
            break
        except MaxDownloadsReached:
            # Not an error: the caller must see it to stop downloading,
            # even when 'ignoreerrors' is set
            raise
        except Exception as e:
            if self.params.get('ignoreerrors', False):
                self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc()))
                break
            else:
                raise
    if not suitable_found:
        self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
def process_ie_result(self, ie_result, download=True):
    """
    Take the result of the ie and resolve it to the final video(s).

    The '_type' key of ie_result selects what is done (when it is missing
    the result is treated as a 'video', which supports the old system):
    'video'    -- the 'playlist' and 'playlist_index' keys are set to None
                  if absent; the dictionary is returned.
    'url'      -- the url is passed to extract_info() and its first result
                  is returned.
    'playlist' -- each element of the 'entries' key (restricted by the
                  'playliststart'/'playlistend' params) is processed and
                  gets its 'playlist' and 'playlist_index' keys set; a copy
                  of ie_result with the processed entries is returned.

    It will also download the videos if 'download'.

    Returns None for a 'url' result that could not be resolved; such
    entries are left out of a playlist instead of aborting it.
    Raises Exception if '_type' has an unknown value.
    """
    # If not given we suppose it's a video, support the default old system
    result_type = ie_result.get('_type', 'video')
    if result_type == 'video':
        if 'playlist' not in ie_result:
            # It isn't part of a playlist
            ie_result['playlist'] = None
            ie_result['playlist_index'] = None
        if download:
            # Do the download:
            self.process_info(ie_result)
        return ie_result
    elif result_type == 'url':
        # We get the video pointed by the url
        results = self.extract_info(ie_result['url'], download)
        if not results:
            # extract_info() gives None (or nothing) when the extraction
            # failed; the error has already been reported there
            return None
        return results[0]
    elif result_type == 'playlist':
        # We process each entry in the playlist
        playlist = ie_result.get('title', None) or ie_result.get('id', None)
        self.to_screen(u'[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        n_all_entries = len(ie_result['entries'])
        # 'playliststart' is 1-based, slices are 0-based
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend', -1)

        if playlistend == -1:
            entries = ie_result['entries'][playliststart:]
        else:
            entries = ie_result['entries'][playliststart:playlistend]

        n_entries = len(entries)

        self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                       (ie_result['extractor'], playlist, n_all_entries, n_entries))

        for i, entry in enumerate(entries, 1):
            self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
            entry_result = self.process_ie_result(entry, False)
            if entry_result is None:
                # The entry could not be resolved, go on with the others
                continue
            entry_result['playlist'] = playlist
            entry_result['playlist_index'] = i + playliststart
            # We must do the download here to correctly set the 'playlist' key
            if download:
                self.process_info(entry_result)
            playlist_results.append(entry_result)
        result = ie_result.copy()
        result['entries'] = playlist_results
        return result
    else:
        raise Exception(u'Invalid result type: %s' % result_type)
def process_info(self, info_dict): def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor.""" """Process a single dictionary returned by an InfoExtractor."""
#We increment the download count here to match the previous behaviour.
self.increment_downloads()
info_dict['fulltitle'] = info_dict['title'] info_dict['fulltitle'] = info_dict['title']
if len(info_dict['title']) > 200: if len(info_dict['title']) > 200:
info_dict['title'] = info_dict['title'][:197] + u'...' info_dict['title'] = info_dict['title'][:197] + u'...'
@ -564,53 +666,14 @@ def download(self, url_list):
raise SameFileError(self.params['outtmpl']) raise SameFileError(self.params['outtmpl'])
for url in url_list: for url in url_list:
suitable_found = False
for ie in self._ies:
# Go to next InfoExtractor if not suitable
if not ie.suitable(url):
continue
# Warn if the _WORKING attribute is False
if not ie.working():
self.report_warning(u'the program functionality for this site has been marked as broken, '
u'and will probably not work. If you want to go on, use the -i option.')
# Suitable InfoExtractor found
suitable_found = True
# Extract information from URL and process it
try: try:
videos = ie.extract(url) #It also downloads the videos
except ExtractorError as de: # An error we somewhat expected videos = self.extract_info(url)
self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback()) except UnavailableVideoError:
break self.trouble(u'\nERROR: unable to download video')
except MaxDownloadsReached: except MaxDownloadsReached:
self.to_screen(u'[info] Maximum number of downloaded files reached.') self.to_screen(u'[info] Maximum number of downloaded files reached.')
raise raise
except Exception as e:
if self.params.get('ignoreerrors', False):
self.report_error(u'' + compat_str(e), tb=compat_str(traceback.format_exc()))
break
else:
raise
if len(videos or []) > 1 and self.fixed_template():
raise SameFileError(self.params['outtmpl'])
for video in videos or []:
video['extractor'] = ie.IE_NAME
try:
self.increment_downloads()
self.process_info(video)
except UnavailableVideoError:
self.to_stderr(u"\n")
self.report_error(u'unable to download video')
# Suitable InfoExtractor had been found; go to next URL
break
if not suitable_found:
self.report_error(u'no suitable InfoExtractor: %s' % url)
return self._download_retcode return self._download_retcode

View File

@ -144,6 +144,28 @@ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
self._downloader.to_screen(dump) self._downloader.to_screen(dump)
return webpage_bytes.decode(encoding, 'replace') return webpage_bytes.decode(encoding, 'replace')
#Methods for following #608
#They set the correct value of the '_type' key
def video_result(self, video_info):
    """Tag `video_info` as a video result.

    The dictionary is modified in place (its '_type' key becomes 'video')
    and the same object is returned.
    """
    video_info.update({'_type': 'video'})
    return video_info
def url_result(self, url, ie=None):
    """Build a result of type 'url' pointing to a page that still has to be processed.

    url -- the address to be processed
    ie  -- currently ignored
    """
    #TODO: ie should be the class used for getting the info
    return dict(_type='url', url=url)
def playlist_result(self, entries, playlist_id=None, playlist_title=None):
    """Build a result of type 'playlist' holding `entries`.

    The 'id' and 'title' keys are only present when a non-empty
    playlist_id / playlist_title is given.
    """
    result = {'_type': 'playlist', 'entries': entries}
    optional = (('id', playlist_id), ('title', playlist_title))
    result.update((key, value) for key, value in optional if value)
    return result
class YoutubeIE(InfoExtractor): class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com.""" """Information extractor for youtube.com."""
@ -706,8 +728,7 @@ def _real_extract(self, url):
# Check if video comes from YouTube # Check if video comes from YouTube
mobj2 = re.match(r'^yt-(.*)$', video_id) mobj2 = re.match(r'^yt-(.*)$', video_id)
if mobj2 is not None: if mobj2 is not None:
self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
return
# Retrieve video webpage to extract further information # Retrieve video webpage to extract further information
request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
@ -1348,7 +1369,7 @@ def report_following_redirect(self, new_url):
self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
def _test_redirect(self, url): def _test_redirect(self, url):
"""Check if it is a redirect, like url shorteners, in case restart chain.""" """Check if it is a redirect, like url shorteners, in case return the new url."""
class HeadRequest(compat_urllib_request.Request): class HeadRequest(compat_urllib_request.Request):
def get_method(self): def get_method(self):
return "HEAD" return "HEAD"
@ -1399,11 +1420,11 @@ def http_error_405(self, req, fp, code, msg, headers):
return False return False
self.report_following_redirect(new_url) self.report_following_redirect(new_url)
self._downloader.download([new_url]) return new_url
return True
def _real_extract(self, url): def _real_extract(self, url):
if self._test_redirect(url): return new_url = self._test_redirect(url)
if new_url: return [self.url_result(new_url)]
video_id = url.split('/')[-1] video_id = url.split('/')[-1]
try: try:
@ -1794,23 +1815,9 @@ def _real_extract(self, url):
page_num += 1 page_num += 1
videos = [v[1] for v in sorted(videos)] videos = [v[1] for v in sorted(videos)]
total = len(videos)
playliststart = self._downloader.params.get('playliststart', 1) - 1 url_results = [self.url_result(url) for url in videos]
playlistend = self._downloader.params.get('playlistend', -1) return [self.playlist_result(url_results, playlist_id)]
if playlistend == -1:
videos = videos[playliststart:]
else:
videos = videos[playliststart:playlistend]
if len(videos) == total:
self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
else:
self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
for video in videos:
self._downloader.download([video])
return
class YoutubeChannelIE(InfoExtractor): class YoutubeChannelIE(InfoExtractor):
@ -1860,9 +1867,9 @@ def _real_extract(self, url):
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
for id in video_ids: urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) url_entries = [self.url_result(url) for url in urls]
return return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor): class YoutubeUserIE(InfoExtractor):
@ -1932,20 +1939,9 @@ def _real_extract(self, url):
pagenum += 1 pagenum += 1
all_ids_count = len(video_ids) urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
playliststart = self._downloader.params.get('playliststart', 1) - 1 url_results = [self.url_result(url) for url in urls]
playlistend = self._downloader.params.get('playlistend', -1) return [self.playlist_result(url_results, playlist_title = username)]
if playlistend == -1:
video_ids = video_ids[playliststart:]
else:
video_ids = video_ids[playliststart:playlistend]
self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
(username, all_ids_count, len(video_ids)))
for video_id in video_ids:
self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor): class BlipTVUserIE(InfoExtractor):
@ -2023,20 +2019,12 @@ def _real_extract(self, url):
pagenum += 1 pagenum += 1
all_ids_count = len(video_ids)
playliststart = self._downloader.params.get('playliststart', 1) - 1
playlistend = self._downloader.params.get('playlistend', -1)
if playlistend == -1:
video_ids = video_ids[playliststart:]
else:
video_ids = video_ids[playliststart:playlistend]
self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" % self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
(self.IE_NAME, username, all_ids_count, len(video_ids))) (self.IE_NAME, username, all_ids_count, len(video_ids)))
for video_id in video_ids: urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
self._downloader.download([u'http://blip.tv/'+video_id]) url_entries = [self.url_result(url) for url in urls]
return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor): class DepositFilesIE(InfoExtractor):