Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.
This commit is contained in:
Jaime Marquínez Ferrándiz 2015-02-15 18:03:41 +01:00
parent 8fb474fb17
commit a504ced097
7 changed files with 121 additions and 37 deletions

View File

@ -27,15 +27,23 @@ class BaseTestSubtitles(unittest.TestCase):
def setUp(self): def setUp(self):
self.DL = FakeYDL() self.DL = FakeYDL()
self.ie = self.IE(self.DL) self.ie = self.IE()
self.DL.add_info_extractor(self.ie)
def getInfoDict(self): def getInfoDict(self):
info_dict = self.ie.extract(self.url) info_dict = self.DL.extract_info(self.url, download=False)
return info_dict return info_dict
def getSubtitles(self): def getSubtitles(self):
info_dict = self.getInfoDict() info_dict = self.getInfoDict()
return info_dict['subtitles'] subtitles = info_dict['subtitles']
if not subtitles:
return subtitles
for sub_info in subtitles.values():
if sub_info.get('data') is None:
uf = self.DL.urlopen(sub_info['url'])
sub_info['data'] = uf.read().decode('utf-8')
return dict((l, sub_info['data']) for l, sub_info in subtitles.items())
class TestYoutubeSubtitles(BaseTestSubtitles): class TestYoutubeSubtitles(BaseTestSubtitles):
@ -176,7 +184,7 @@ class TestTedSubtitles(BaseTestSubtitles):
def test_no_writesubtitles(self): def test_no_writesubtitles(self):
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(subtitles, None) self.assertFalse(subtitles)
def test_subtitles(self): def test_subtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
@ -196,18 +204,10 @@ def test_allsubtitles(self):
self.assertTrue(len(subtitles.keys()) >= 28) self.assertTrue(len(subtitles.keys()) >= 28)
def test_list_subtitles(self): def test_list_subtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict() info_dict = self.getInfoDict()
self.assertEqual(info_dict, None) self.assertEqual(info_dict, None)
def test_automatic_captions(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) == 0)
def test_multiple_langs(self): def test_multiple_langs(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
langs = ['es', 'fr', 'de'] langs = ['es', 'fr', 'de']

View File

@ -154,7 +154,7 @@ class YoutubeDL(object):
allsubtitles: Downloads all the subtitles of the video allsubtitles: Downloads all the subtitles of the video
(requires writesubtitles or writeautomaticsub) (requires writesubtitles or writeautomaticsub)
listsubtitles: Lists all available subtitles for the video listsubtitles: Lists all available subtitles for the video
subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) subtitlesformat: The format code for subtitles
subtitleslangs: List of languages of the subtitles to download subtitleslangs: List of languages of the subtitles to download
keepvideo: Keep the video file after post-processing keepvideo: Keep the video file after post-processing
daterange: A DateRange object, download only if the upload_date is in the range. daterange: A DateRange object, download only if the upload_date is in the range.
@ -1019,6 +1019,11 @@ def process_video_result(self, info_dict, download=True):
info_dict['timestamp']) info_dict['timestamp'])
info_dict['upload_date'] = upload_date.strftime('%Y%m%d') info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
if self.params.get('listsubtitles', False):
self.list_subtitles(info_dict['id'], info_dict.get('subtitles'))
return
info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles'))
# This extractors handle format selection themselves # This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']: if info_dict['extractor'] in ['Youku']:
if download: if download:
@ -1147,6 +1152,53 @@ def process_video_result(self, info_dict, download=True):
info_dict.update(formats_to_download[-1]) info_dict.update(formats_to_download[-1])
return info_dict return info_dict
def process_subtitles(self, video_id, available_subs):
"""Select the requested subtitles and their format"""
if not available_subs:
return available_subs
if self.params.get('allsubtitles', False):
requested_langs = available_subs.keys()
else:
if self.params.get('subtitleslangs', False):
requested_langs = self.params.get('subtitleslangs')
elif 'en' in available_subs:
requested_langs = ['en']
else:
requested_langs = [list(available_subs.keys())[0]]
formats_query = self.params.get('subtitlesformat', 'best')
formats_preference = formats_query.split('/') if formats_query else []
subs = {}
for lang in requested_langs:
formats = available_subs.get(lang)
if formats is None:
self.report_warning('%s subtitles not available for %s' % (lang, video_id))
continue
if isinstance(formats, compat_str):
# TODO: convert all IE with subtitles support to the new format
# and remove this
subs[lang] = {
'ext': formats_preference[0],
'data': formats,
}
continue
for ext in formats_preference:
if ext == 'best':
f = formats[-1]
break
matches = list(filter(lambda f: f['ext'] == ext, formats))
if matches:
f = matches[-1]
break
else:
f = formats[-1]
self.report_warning(
'No subtitle format found matching "%s" for language %s, '
'using %s' % (formats_query, lang, f['ext']))
subs[lang] = f
return subs
def process_info(self, info_dict): def process_info(self, info_dict):
"""Process a single resolved IE result.""" """Process a single resolved IE result."""
@ -1253,10 +1305,17 @@ def process_info(self, info_dict):
# subtitles download errors are already managed as troubles in relevant IE # subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE # that way it will silently go on when used with unsupporting IE
subtitles = info_dict['subtitles'] subtitles = info_dict['subtitles']
sub_format = self.params.get('subtitlesformat', 'srt') for sub_lang, sub_info in subtitles.items():
for sub_lang in subtitles.keys(): sub_format = sub_info['ext']
sub = subtitles[sub_lang] if sub_info.get('data') is not None:
if sub is None: sub_data = sub_info['data']
else:
try:
uf = self.urlopen(sub_info['url'])
sub_data = uf.read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, compat_str(err)))
continue continue
try: try:
sub_filename = subtitles_filename(filename, sub_lang, sub_format) sub_filename = subtitles_filename(filename, sub_lang, sub_format)
@ -1265,7 +1324,7 @@ def process_info(self, info_dict):
else: else:
self.to_screen('[info] Writing video subtitles to: ' + sub_filename) self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
subfile.write(sub) subfile.write(sub_data)
except (OSError, IOError): except (OSError, IOError):
self.report_error('Cannot write subtitles file ' + sub_filename) self.report_error('Cannot write subtitles file ' + sub_filename)
return return
@ -1586,6 +1645,18 @@ def list_thumbnails(self, info_dict):
['ID', 'width', 'height', 'URL'], ['ID', 'width', 'height', 'URL'],
[[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
def list_subtitles(self, video_id, subtitles):
if not subtitles:
self.to_screen('%s has no subtitles' % video_id)
return
header_line = 'Language formats'
sub_lines = [
'%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats)))
for lang, formats in subtitles.items()]
self.to_screen(
'Available subtitles for %s:\n%s\n%s' %
(video_id, header_line, '\n'.join(sub_lines)))
def urlopen(self, req): def urlopen(self, req):
""" Start an HTTP download """ """ Start an HTTP download """

View File

@ -226,7 +226,6 @@ def _real_main(argv=None):
if opts.embedsubtitles: if opts.embedsubtitles:
postprocessors.append({ postprocessors.append({
'key': 'FFmpegEmbedSubtitle', 'key': 'FFmpegEmbedSubtitle',
'subtitlesformat': opts.subtitlesformat,
}) })
if opts.xattrs: if opts.xattrs:
postprocessors.append({'key': 'XAttrMetadata'}) postprocessors.append({'key': 'XAttrMetadata'})

View File

@ -151,8 +151,14 @@ class InfoExtractor(object):
If not explicitly set, calculated from timestamp. If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader. uploader_id: Nickname or id of the video uploader.
location: Physical location where the video was filmed. location: Physical location where the video was filmed.
subtitles: The subtitle file contents as a dictionary in the format subtitles: The available subtitles as a dictionary in the format
{language: subtitles}. {language: subformats}. "subformats" is a list sorted from
lower to higher preference, each element is a dictionary
with the "ext" entry and one of:
* "data": The subtitles file contents
* "url": A url pointing to the subtitles file
Note: YoutubeDL.extract_info will get the requested
format and replace the "subformats" list with it.
duration: Length of the video in seconds, as an integer. duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform. view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video like_count: Number of positive ratings of the video
@ -993,6 +999,16 @@ def is_suitable(self, age_limit):
any_restricted = any_restricted or is_restricted any_restricted = any_restricted or is_restricted
return not any_restricted return not any_restricted
def extract_subtitles(self, *args, **kwargs):
subtitles = {}
list_subtitles = self._downloader.params.get('listsubtitles')
if self._downloader.params.get('writesubtitles', False) or list_subtitles:
subtitles.update(self._get_subtitles(*args, **kwargs))
return subtitles
def _get_subtitles(self, *args, **kwargs):
raise NotImplementedError("This method must be implemented by subclasses")
class SearchInfoExtractor(InfoExtractor): class SearchInfoExtractor(InfoExtractor):
""" """

View File

@ -3,14 +3,14 @@
import json import json
import re import re
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
) )
class TEDIE(SubtitlesInfoExtractor): class TEDIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?P<proto>https?://) (?P<proto>https?://)
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
@ -165,9 +165,6 @@ def _talk_info(self, url, video_name):
video_id = compat_str(talk_info['id']) video_id = compat_str(talk_info['id'])
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, talk_info) video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, talk_info)
return
thumbnail = talk_info['thumb'] thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'): if not thumbnail.startswith('http'):
@ -183,13 +180,18 @@ def _talk_info(self, url, video_name):
'duration': talk_info.get('duration'), 'duration': talk_info.get('duration'),
} }
def _get_available_subtitles(self, video_id, talk_info): def _get_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages: if languages:
sub_lang_list = {} sub_lang_list = {}
for l in languages: for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) sub_lang_list[l] = [
sub_lang_list[l] = url {
'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
'ext': ext,
}
for ext in ['ted', 'srt']
]
return sub_lang_list return sub_lang_list
else: else:
self._downloader.report_warning('video doesn\'t have subtitles') self._downloader.report_warning('video doesn\'t have subtitles')

View File

@ -387,8 +387,8 @@ def _hide_login_info(opts):
help='lists all available subtitles for the video') help='lists all available subtitles for the video')
subtitles.add_option( subtitles.add_option(
'--sub-format', '--sub-format',
action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', action='store', dest='subtitlesformat', metavar='FORMAT', default='best',
help='subtitle format (default=srt) ([sbv/vtt] youtube only)') help='subtitle format, accepts formats preference, for example: "ass/srt/best"')
subtitles.add_option( subtitles.add_option(
'--sub-lang', '--sub-langs', '--srt-lang', '--sub-lang', '--sub-langs', '--srt-lang',
action='callback', dest='subtitleslangs', metavar='LANGS', type='str', action='callback', dest='subtitleslangs', metavar='LANGS', type='str',

View File

@ -453,10 +453,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
'zu': 'zul', 'zu': 'zul',
} }
def __init__(self, downloader=None, subtitlesformat='srt'):
super(FFmpegEmbedSubtitlePP, self).__init__(downloader)
self._subformat = subtitlesformat
@classmethod @classmethod
def _conver_lang_code(cls, code): def _conver_lang_code(cls, code):
"""Convert language code from ISO 639-1 to ISO 639-2/T""" """Convert language code from ISO 639-1 to ISO 639-2/T"""
@ -472,7 +468,7 @@ def run(self, information):
sub_langs = [key for key in information['subtitles']] sub_langs = [key for key in information['subtitles']]
filename = information['filepath'] filename = information['filepath']
input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()]
opts = [ opts = [
'-map', '0', '-map', '0',