Try to mitigate the problem of loading a fully sorted archive

Sorted archives turn the binary tree into a linked list and make
things horribly slow. This is an incomplete mitigation for this
issue.
This commit is contained in:
Jody Bruchon 2020-09-17 17:28:22 -04:00
parent 1de7ea76f8
commit 1d74d8d9f6

View File

@ -113,12 +113,14 @@ from .version import __version__
if compat_os_name == 'nt': if compat_os_name == 'nt':
import ctypes import ctypes
# Archive tree
class ArchiveTree(object): class ArchiveTree(object):
def __init__(self, line): def __init__(self, line):
self.left = None self.left = None
self.right = None self.right = None
self.line = line self.line = line
# Tree insertion
def at_insert(self, line): def at_insert(self, line):
# print("at_insert: ", line) # print("at_insert: ", line)
cur = self cur = self
@ -130,6 +132,7 @@ class ArchiveTree(object):
cur.left = ArchiveTree(line) cur.left = ArchiveTree(line)
return return
else: else:
# print("LEFT")
cur = cur.left cur = cur.left
continue continue
elif line > cur.line: elif line > cur.line:
@ -137,6 +140,7 @@ class ArchiveTree(object):
cur.right = ArchiveTree(line) cur.right = ArchiveTree(line)
return return
else: else:
# print("RIGHT")
cur = cur.right cur = cur.right
continue continue
else: else:
@ -410,16 +414,55 @@ class YoutubeDL(object):
"""Preload the archive, if any is specified""" """Preload the archive, if any is specified"""
def preload_download_archive(self):
    """Load every entry of the download archive file into self.archive.

    The archive path is read from the 'download_archive' parameter. Each
    line of the file is stripped of surrounding whitespace and handed to
    self.archive.at_insert().

    Returns False when no archive is configured or when no lines could be
    read (including when the archive file does not exist yet); returns
    True once at least one line has been inserted.

    Raises IOError for any failure to read the file other than ENOENT.
    """
    fn = self.params.get('download_archive')
    if fn is None:
        return False

    lines = []
    try:
        with locked_file(fn, 'r', encoding='utf-8') as archive_file:
            lines = [line.strip() for line in archive_file]
    except IOError as ioe:
        # A missing archive simply means nothing has been recorded yet
        if ioe.errno != errno.ENOENT:
            raise

    if not lines:
        # No lines were loaded
        return False

    # at_insert() descends left/right by comparing strings, so feeding it
    # a sorted archive front to back produces one long chain.  Instead,
    # insert the middle element of each range before the elements on
    # either side of it: for sorted input every node then splits its
    # range in half and the depth stays logarithmic.  Every index is
    # visited exactly once, so no archive entry is skipped.
    pending = [(0, len(lines))]  # half-open [start, end) index ranges
    while pending:
        start, end = pending.pop()
        if start >= end:
            continue
        middle = (start + end) // 2
        self.archive.at_insert(lines[middle])
        pending.append((middle + 1, end))
        pending.append((start, middle))
    return True
def check_deprecated(param, option, suggestion): def check_deprecated(param, option, suggestion):