From 41ea293897b307c495e63accccb91d4eae9a29f7 Mon Sep 17 00:00:00 2001 From: Jennifer Taylor Date: Thu, 12 Nov 2020 05:03:33 +0000 Subject: [PATCH] Faster, but less than ideal, large file compression as well as start byte GC. --- bemani/protocol/lz77.py | 82 +++++++++++++++++++++++++++++---------- bemani/tests/test_lz77.py | 10 +++++ 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/bemani/protocol/lz77.py b/bemani/protocol/lz77.py index 439511c..c36f0b7 100644 --- a/bemani/protocol/lz77.py +++ b/bemani/protocol/lz77.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Generator, List, Mapping, Optional, Set, Tuple +from typing import Generator, List, MutableMapping, Optional, Set, Tuple class LzException(Exception): @@ -206,6 +206,8 @@ class Lz77Compress: RING_LENGTH = 0x1000 + LOOSE_COMPRESS_THRESHOLD = 1024 * 512 + FLAG_COPY = 1 FLAG_BACKREF = 0 @@ -222,11 +224,32 @@ class Lz77Compress: self.eof: bool = False self.bytes_written: int = 0 self.ringlength: int = backref or self.RING_LENGTH - self.locations: Mapping[int, Set[int]] = defaultdict(set) - self.starts: Mapping[bytes, Set[int]] = defaultdict(set) + self.locations: MutableMapping[int, Set[int]] = defaultdict(set) + self.starts: MutableMapping[bytes, Set[int]] = defaultdict(set) self.last_start: Tuple[int, int, int] = (0, 0, 0) - def __ring_write(self, bytedata: bytes) -> None: + if len(data) > self.LOOSE_COMPRESS_THRESHOLD: + self.__ring_write = self.__ring_write_starts_only + else: + self.__ring_write = self.__ring_write_both + + def __ring_write_starts_only(self, bytedata: bytes) -> None: + """ + Write bytes into the backref ring. 
+ + Parameters: + bytedata - The bytes to be written, starting at the current write offset + """ + for byte in bytedata: + # Update the start locations hashmap if we're past the beginning + self.last_start = (self.last_start[1], self.last_start[2], byte) + if self.bytes_written >= 2: + self.starts[bytes(self.last_start)].add(self.bytes_written - 2) + + # Keep track of the fact that we wrote this byte. + self.bytes_written += 1 + + def __ring_write_both(self, bytedata: bytes) -> None: """ Write bytes into the backref ring. @@ -241,6 +264,8 @@ class Lz77Compress: # Update the rest of the location hashmaps self.locations[byte].add(self.bytes_written) + + # Keep track of the fact that we wrote this byte. self.bytes_written += 1 def compress_bytes(self) -> Generator[bytes, None, None]: @@ -290,10 +315,13 @@ class Lz77Compress: # Iterate over all spots where the first byte equals, and is in range. earliest = max(0, self.bytes_written - (self.ringlength - 1)) - possible_backref_locations: List[int] = [ - absolute_pos for absolute_pos in self.starts[self.data[self.read_pos:(self.read_pos + 3)]] + index = self.data[self.read_pos:(self.read_pos + 3)] + updated_backref_locations: Set[int] = set( + absolute_pos for absolute_pos in self.starts[index] if absolute_pos >= earliest - ] + ) + self.starts[index] = updated_backref_locations + possible_backref_locations: List[int] = list(updated_backref_locations) # Output the data as a copy if we couldn't find a backref if not possible_backref_locations: @@ -311,26 +339,40 @@ class Lz77Compress: # we're going to write at least these three bytes, so append it to the # output buffer. start_write_size = self.bytes_written - self.__ring_write(self.data[self.read_pos:(self.read_pos + 3)]) + self.__ring_write(index) copy_amount = 3 - for _ in range(backref_amount - 3): - # Check our existing locations to figure out if we still have - # longest prefixes. 
- locations = self.locations[self.data[self.read_pos + copy_amount]] + while copy_amount < (backref_amount): + # First, let's see if we have any 3-wide chunks to consume. + index = self.data[(self.read_pos + copy_amount):(self.read_pos + copy_amount + 3)] + locations = self.starts[index] new_backref_locations: List[int] = [ absolute_pos for absolute_pos in possible_backref_locations if absolute_pos + copy_amount in locations ] - # If we have no longest prefixes, that means that any of the - # previous prefixes are good enough. - if not new_backref_locations: - break + if new_backref_locations: + # Mark that we're copying three extra bytes from the backref. + self.__ring_write(index) + copy_amount += 3 + possible_backref_locations = new_backref_locations + else: + # Check our existing locations to figure out if we still have + # longest prefixes of 1 or 2 left. + locations = self.locations[self.data[self.read_pos + copy_amount]] + new_backref_locations = [ + absolute_pos for absolute_pos in possible_backref_locations + if absolute_pos + copy_amount in locations + ] - # Mark that we're copying an extra byte from the backref. - self.__ring_write(self.data[(self.read_pos + copy_amount):(self.read_pos + copy_amount + 1)]) - copy_amount += 1 - possible_backref_locations = new_backref_locations + # If we have no longest prefixes, that means that any of the + # previous prefixes are good enough. + if not new_backref_locations: + break + + # Mark that we're copying an extra byte from the backref. + self.__ring_write(self.data[(self.read_pos + copy_amount):(self.read_pos + copy_amount + 1)]) + copy_amount += 1 + possible_backref_locations = new_backref_locations # Now that we have a list of candidates, arbitrarily pick the # first one as our candidate and output it. 
diff --git a/bemani/tests/test_lz77.py b/bemani/tests/test_lz77.py index fee7144..3e3b2b8 100644 --- a/bemani/tests/test_lz77.py +++ b/bemani/tests/test_lz77.py @@ -53,6 +53,16 @@ class TestLz77RealCompressor(unittest.TestCase): decompresseddata = lz77.decompress(compresseddata) self.assertEqual(data, decompresseddata) + def test_huge_data_random(self) -> None: + lz77 = Lz77() + data = bytes([random.randint(0, 255) for _ in range(1 * 1024 * 1024)]) + + compresseddata = lz77.compress(data) + self.assertNotEqual(data, compresseddata) + + decompresseddata = lz77.decompress(compresseddata) + self.assertEqual(data, decompresseddata) + def test_declaration(self) -> None: lz77 = Lz77() data = get_fixture("declaration.txt")