Keep download archive in memory for better performance

The old behavior was to open and scan the entire archive file for
every single video download. This resulted in horrible performance
for archives of any remotely large size, especially since all new
video IDs are appended to the end of the archive. For anyone who
uses the archive feature to maintain archives of entire video
playlists or channels, this meant that all such lists with newer
downloads would have to scan close to the end of the archive file
before the potential download was rejected. For archives with tens
of thousands of lines, this easily resulted in millions of line
reads and checks over the course of scanning a single channel or
playlist that had been seen previously.

The new behavior in this commit is to preload the archive file
into a binary search tree and scan the tree instead of constantly
scanning the file on disk for every check. When a new download is
appended to the archive file, it is also added to this tree. The
performance is massively better using this strategy over the more
"naive" line-by-line archive file parsing strategy.

The only negative consequence of this change is that the archive
in memory will not be synchronized with the archive file on disk.
Running multiple instances of the program at the same time that
all use the same archive file may result in duplicate archive
entries or duplicated downloads. This is unlikely to be a serious
issue for the vast majority of users. If the instances are not
likely to try to download identical video IDs then this should
not be a problem anyway; for example, having two instances pull
two completely different YouTube channels at once should be fine.

Signed-off-by: Jody Bruchon <jody@jodybruchon.com>
This commit is contained in:
Jody Bruchon 2020-09-17 14:22:07 -04:00
parent 7ac0ba50ce
commit ecdec1913f

View File

@ -113,6 +113,43 @@
if compat_os_name == 'nt': if compat_os_name == 'nt':
import ctypes import ctypes
class ArchiveTree(object):
    """Binary search tree holding download-archive lines (video IDs).

    A node with line=None is the empty-tree sentinel used for the root.
    Insert and lookup are iterative rather than recursive: archive files
    are appended in insertion order, so loading one produces a fully
    degenerate (linked-list shaped) tree whose depth equals the number
    of entries -- recursion would hit Python's default recursion limit
    after roughly a thousand archive lines.
    """

    def __init__(self, line):
        self.left = None
        self.right = None
        self.line = line

    def at_insert(self, line):
        """Insert 'line' into the tree; duplicates are silently ignored."""
        if self.line is None:
            # Empty root sentinel: claim it instead of adding a child
            self.line = line
            return
        node = self
        while True:
            if line < node.line:
                if node.left is None:
                    node.left = ArchiveTree(line)
                    return
                node = node.left
            elif line > node.line:
                if node.right is None:
                    node.right = ArchiveTree(line)
                    return
                node = node.right
            else:
                # Already present
                return

    def at_exist(self, line):
        """Return True if 'line' was previously inserted, else False."""
        node = self
        while node is not None and node.line is not None:
            if line < node.line:
                node = node.left
            elif line > node.line:
                node = node.right
            else:
                return True
        return False
class YoutubeDL(object): class YoutubeDL(object):
"""YoutubeDL class. """YoutubeDL class.
@ -359,6 +396,21 @@ def __init__(self, params=None, auto_init=True):
} }
self.params.update(params) self.params.update(params)
self.cache = Cache(self) self.cache = Cache(self)
self.archive = ArchiveTree(None)
"""Preload the archive, if any is specified"""
def preload_download_archive(self):
fn = self.params.get('download_archive')
if fn is None:
return False
try:
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
for line in archive_file:
self.archive.at_insert(line.strip())
except IOError as ioe:
if ioe.errno != errno.ENOENT:
raise
return True
def check_deprecated(param, option, suggestion): def check_deprecated(param, option, suggestion):
if self.params.get(param) is not None: if self.params.get(param) is not None:
@ -367,6 +419,8 @@ def check_deprecated(param, option, suggestion):
return True return True
return False return False
preload_download_archive(self)
if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
if self.params.get('geo_verification_proxy') is None: if self.params.get('geo_verification_proxy') is None:
self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
@ -722,7 +776,7 @@ def prepare_filename(self, info_dict):
return None return None
def _match_entry(self, info_dict, incomplete): def _match_entry(self, info_dict, incomplete):
""" Returns None iff the file should be downloaded """ """ Returns None if the file should be downloaded """
video_title = info_dict.get('title', info_dict.get('id', 'video')) video_title = info_dict.get('title', info_dict.get('id', 'video'))
if 'title' in info_dict: if 'title' in info_dict:
@ -2142,15 +2196,7 @@ def in_download_archive(self, info_dict):
if not vid_id: if not vid_id:
return False # Incomplete video information return False # Incomplete video information
try: return self.archive.at_exist(vid_id)
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
for line in archive_file:
if line.strip() == vid_id:
return True
except IOError as ioe:
if ioe.errno != errno.ENOENT:
raise
return False
def record_download_archive(self, info_dict): def record_download_archive(self, info_dict):
fn = self.params.get('download_archive') fn = self.params.get('download_archive')
@ -2160,6 +2206,7 @@ def record_download_archive(self, info_dict):
assert vid_id assert vid_id
with locked_file(fn, 'a', encoding='utf-8') as archive_file: with locked_file(fn, 'a', encoding='utf-8') as archive_file:
archive_file.write(vid_id + '\n') archive_file.write(vid_id + '\n')
self.archive.at_insert(vid_id)
@staticmethod @staticmethod
def format_resolution(format, default='unknown'): def format_resolution(format, default='unknown'):