mymc/lzari.py

#
# lzari.py
#
# By Ross Ridge
#

"""
Implementation of Haruhiko Okumura's LZARI data compression algorithm
in Python.  Largely based on LZARI.C, one key difference is the use of
a two level dictionary look up during compression rather than
LZARI.C's binary search tree.
"""

# Version identification string for this module.  The "@(#)" prefix and the
# variable name suggest an SCCS what(1)-style marker; it is not referenced
# anywhere else in the visible part of this file.
_SCCS_ID = "@(#) mysc lzari.py 1.5 08/02/05 16:00:34\n"

import sys
import array
import binascii
import string
import time
from bisect import bisect_right
from math import log

try:
	import ctypes
	import mymcsup
except ImportError:
	mymcsup = None

# Shorthand for binascii.hexlify; only the commented-out debugging code in
# this file refers to it.
hexlify = binascii.hexlify

# Names exported by "from lzari import *".  Python only honours the
# lower-case spelling "__all__"; the original upper-case "__ALL__" had no
# effect.  The module-level decode() and encode() functions are listed as
# well so that star-imports of them keep working now that the list is
# actually honoured.
__all__ = ['lzari_codec', 'string_to_bit_array', 'bit_array_to_string',
	   'decode', 'encode']

# Original (ineffective) spelling, kept as an alias in case anything reads it.
__ALL__ = __all__

#
# Fundamental constants of the LZARI compression algorithm.
#
# Changing any of these values will create an incompatible implementation.
#

# Size of the history window, in bytes, that matches may refer back into.
HIST_LEN = 1 << 12

# Shortest and longest match length that is ever encoded.
MIN_MATCH_LEN = 3
MAX_MATCH_LEN = 60

# The arithmetic coder works on the interval [0, QUADRANT4), which is split
# into four equal quadrants of 2 ** ARITH_BITS values each.
ARITH_BITS = 15
QUADRANT1 = 2 ** ARITH_BITS
QUADRANT2 = 2 * QUADRANT1
QUADRANT3 = 3 * QUADRANT1
QUADRANT4 = 4 * QUADRANT1

# Total cumulative symbol frequency at which the symbol model halves its
# frequency counts.
MAX_CUM = QUADRANT1 - 1

# Number of symbols in the model: 256 literal byte values plus one symbol
# for each possible match length.
MAX_CHAR = 256 + (MAX_MATCH_LEN - MIN_MATCH_LEN + 1)

#
# Other constants specific to this implementation
#

# NOTE(review): nothing in the visible code refers to MAX_SUFFIX_CHAIN.
MAX_SUFFIX_CHAIN = 50	# limit on how many identical suffixes to try to match

def debug(value, msg):
	"""Tracing hook called by the encoder; does nothing by default."""
	# To trace the encoder replace the body of this function with:
	#	print "@@@ %s %04x" % (msg, value)
	return None

# Translation tables used by string_to_bit_array().  Each one is applied to
# the output of binascii.hexlify() and maps every digit character to a byte
# whose high and low nibbles hold that digit split into two smaller digits,
# so that hex-encoding the result again yields one character per smaller
# digit.

# hex digit (0-15) -> byte with nibbles (digit / 4, digit % 4)
_tr_16 = string.maketrans("0123456789abcdef",
			  "\x00\x01\x02\x03"
			  "\x10\x11\x12\x13"
			  "\x20\x21\x22\x23"
			  "\x30\x31\x32\x33")
# base 4 digit (0-3) -> byte with nibbles (digit / 2, digit % 2)
_tr_4 = string.maketrans("0123",
			 "\x00\x01"
			 "\x10\x11")
# binary digit character -> byte with the value 0 or 1
_tr_2 = string.maketrans("01", "\x00\x01")

def string_to_bit_array(s):
	"""Convert a string to an array containing a sequence of bits.

	Returns an array.array('B') holding eight elements for each
	character of s, each element 0 or 1, most significant bit first.
	"""
	# Every pass hex-encodes the data, doubling its length, and then
	# maps the digit characters to bytes made of two smaller digits:
	# bytes -> pairs of base 4 digits -> pairs of bits -> single bits.
	for table in (_tr_16, _tr_4, _tr_2):
		s = binascii.hexlify(s).translate(table)
	return array.array('B', s)

# Translation tables used by bit_array_to_string().  They are the inverses
# of the tables used by string_to_bit_array(): each maps a byte whose
# nibbles hold two small digits to a single digit character, ready to be
# packed by binascii.unhexlify().

# byte with the value 0 or 1 -> binary digit character
_tr_rev_2 = string.maketrans("\x00\x01", "01")
# byte with nibbles (a, b), each 0 or 1 -> base 4 digit 2 * a + b
_tr_rev_4 = string.maketrans("\x00\x01"
			     "\x10\x11",
			     "0123")
# byte with nibbles (a, b), each 0 to 3 -> hex digit 4 * a + b
_tr_rev_16 = string.maketrans("\x00\x01\x02\x03"
			      "\x10\x11\x12\x13"
			      "\x20\x21\x22\x23"
			      "\x30\x31\x32\x33",
			      "0123456789abcdef")
def bit_array_to_string(a):
	"""Convert an array containing a sequence of bits to a string.

	a -- array.array('B') with one bit (0 or 1) per element, the most
	     significant bit of each byte first

	Returns the bits packed eight to a character.  If the number of
	bits is not a multiple of eight the last character is padded with
	zero bits.  The array passed in is not modified.
	"""
	s = a.tostring()
	remainder = len(a) % 8
	if remainder != 0:
		# Pad the string rather than the caller's array so the
		# argument is left untouched.
		s += "\x00" * (8 - remainder)
	# Each pass halves the length: bits -> base 4 digits -> hex digits
	# -> bytes.
	s = binascii.unhexlify(s.translate(_tr_rev_2))
	s = binascii.unhexlify(s.translate(_tr_rev_4))
	return binascii.unhexlify(s.translate(_tr_rev_16))

def _match(src, pos, hpos, mlen, end):
	mlen += 1
	if not src.startswith(src[hpos : hpos + mlen], pos):
		return None
	for i in range(mlen, end):
		if src[pos + i] != src[hpos + i]:
			return i
	return end

def _rehash_table2(src, chars, head, next, next2, hist_invalid):
	"""Rebuild a second level table using keys of chars characters.

	Follows the chain that starts at position head through the next
	table, for as long as the positions are greater than hist_invalid,
	and re-enters every position found, oldest first, under the chars
	characters of src that follow its first MIN_MATCH_LEN characters.
	The next2 table is updated to link positions sharing a key.

	Returns the new dictionary mapping each key to the newest position
	that has it.
	"""
	chain = []
	node = head
	while node > hist_invalid:
		chain.append(node)
		node = next[node % HIST_LEN]

	rebuilt = {}
	# The chain runs from newest to oldest; insert in the opposite order
	# so the newest position ends up at the head of each sub-chain.
	for node in reversed(chain):
		start = node + MIN_MATCH_LEN
		subkey = src[start : start + chars]
		next2[node % HIST_LEN] = rebuilt.get(subkey, hist_invalid)
		rebuilt[subkey] = node
	return rebuilt

class lzari_codec(object):
	"""LZARI compressor and decompressor.

	Combines matching against a history window of HIST_LEN bytes with
	an adaptive arithmetic coder.  Use encode() to compress a string
	and decode() to decompress one.
	"""
	# despite the name this does not implement a codec compatible
	# with Python's codec system

	def init(self, decode):
		"""Reset the arithmetic coder and the adaptive models.

		decode -- true to prepare for decompression, false to prepare
			  for compression
		"""
		self.high = QUADRANT4
		self.low = 0
		if decode:
			self.code = 0
			# reverse the order of sym_cum so bisect_right() can
			# be used for faster searching
			self.sym_cum = list(range(0, MAX_CHAR + 1))
		else:
			self.shifts = 0
			self.char_to_symbol = list(range(1, MAX_CHAR + 1))
			self.sym_cum = list(range(MAX_CHAR, -1, -1))
			self.next_table = [None] * HIST_LEN
			self.next2_table = [None] * HIST_LEN
			self.suffix_table = {}

		self.symbol_to_char = [0] + list(range(MAX_CHAR))
		# sym_freq[0] is a sentinel that never equals a real
		# frequency; it stops the searches in update_model_*().
		self.sym_freq = [0] + [1] * MAX_CHAR
		# Fixed cumulative frequency table for match positions,
		# in descending order; nearer positions get larger shares.
		self.position_cum = [0] * (HIST_LEN + 1)
		a = 0
		for i in range(HIST_LEN, 0, -1):
			a = a + 10000 // (200 + i)
			self.position_cum[i - 1] = a

	def search(self, table, x):
		"""Binary search a table sorted in descending order.

		Returns the lowest index, starting from 1, whose entry is
		less than or equal to x, assuming the last entry of the
		table is.
		"""
		c = 1
		s = len(table) - 1
		while True:
			a = (s + c) // 2
			if table[a] <= x:
				s = a
			else:
				c = a + 1
			if c >= s:
				break
		return c

	def update_model_decode(self, symbol):
		"""Count one more occurrence of symbol while decompressing."""
		# A compatible implementation to the one used while
		# compressing, except that sym_cum is stored in the
		# opposite order.

		sym_freq = self.sym_freq
		sym_cum = self.sym_cum

		if sym_cum[MAX_CHAR] >= MAX_CUM:
			# Halve all the frequencies, rounding up, and
			# rebuild the cumulative table.
			c = 0
			for i in range(MAX_CHAR, 0, -1):
				sym_cum[MAX_CHAR - i] = c
				a = (sym_freq[i] + 1) // 2
				sym_freq[i] = a
				c += a
			sym_cum[MAX_CHAR] = c
		freq = sym_freq[symbol]
		# Find the first symbol with the same frequency and swap
		# places with it, keeping sym_freq sorted.
		new_symbol = symbol
		while sym_freq[new_symbol - 1] == freq:
			new_symbol -= 1
		if new_symbol != symbol:
			symbol_to_char = self.symbol_to_char
			swap_char = symbol_to_char[new_symbol]
			char = symbol_to_char[symbol]
			symbol_to_char[new_symbol] = char
			symbol_to_char[symbol] = swap_char
		sym_freq[new_symbol] = freq + 1
		for i in range(MAX_CHAR - new_symbol + 1, MAX_CHAR + 1):
			sym_cum[i] += 1

	def update_model_encode(self, symbol):
		"""Count one more occurrence of symbol while compressing."""
		sym_freq = self.sym_freq
		sym_cum = self.sym_cum

		if sym_cum[0] >= MAX_CUM:
			# Halve all the frequencies, rounding up, and
			# rebuild the cumulative table.
			c = 0
			for i in range(MAX_CHAR, 0, -1):
				sym_cum[i] = c
				a = (sym_freq[i] + 1) // 2
				sym_freq[i] = a
				c += a
			sym_cum[0] = c
		freq = sym_freq[symbol]
		# Find the first symbol with the same frequency and swap
		# places with it, keeping sym_freq sorted.
		new_symbol = symbol
		while sym_freq[new_symbol - 1] == freq:
			new_symbol -= 1
		if new_symbol != symbol:
			debug(new_symbol, "a")
			swap_char = self.symbol_to_char[new_symbol]
			char = self.symbol_to_char[symbol]
			self.symbol_to_char[new_symbol] = char
			self.symbol_to_char[symbol] = swap_char
			self.char_to_symbol[char] = new_symbol
			self.char_to_symbol[swap_char] = symbol
		sym_freq[new_symbol] += 1
		for i in range(new_symbol):
			sym_cum[i] += 1

	def decode_char(self):
		"""Decode the next character code from the bit stream.

		Returns a value below 0x100 for a literal byte, or 0x100
		plus (match length - MIN_MATCH_LEN) for a match.  The symbol
		model is updated with the decoded symbol.
		"""
		high = self.high
		low = self.low
		code = self.code
		sym_cum = self.sym_cum

		_range = high - low
		max_cum_freq = sym_cum[MAX_CHAR]
		n = ((code - low + 1) * max_cum_freq - 1) // _range
		i = bisect_right(sym_cum, n, 1)
		high = low + sym_cum[i] * _range // max_cum_freq
		low += sym_cum[i - 1] * _range // max_cum_freq
		symbol = MAX_CHAR + 1 - i

		# Renormalize: shift bits in until the interval is wide
		# enough again.
		while True:
			if low < QUADRANT2:
				if low < QUADRANT1 or high > QUADRANT3:
					if high > QUADRANT2:
						break
				else:
					low -= QUADRANT1
					code -= QUADRANT1
					high -= QUADRANT1
			else:
				low -= QUADRANT2
				code -= QUADRANT2
				high -= QUADRANT2
			low *= 2
			high *= 2
			code = code * 2 + self.in_iter()

		ret = self.symbol_to_char[symbol]
		self.high = high
		self.low = low
		self.code = code
		self.update_model_decode(symbol)
		return ret

	def decode_position(self):
		"""Decode a match position from the bit stream.

		Returns the distance back from the current position to the
		start of the match, less one.
		"""
		_range = self.high - self.low
		max_cum = self.position_cum[0]
		target = ((self.code - self.low + 1) * max_cum - 1) // _range
		pos = self.search(self.position_cum, target) - 1
		self.high = (self.low
			+ self.position_cum[pos] * _range // max_cum)
		self.low += self.position_cum[pos + 1] * _range // max_cum
		# Renormalize, exactly as in decode_char().
		while True:
			if self.low < QUADRANT2:
				if (self.low < QUADRANT1
					or self.high > QUADRANT3):
					if self.high > QUADRANT2:
						return pos
				else:
					self.low -= QUADRANT1
					self.code -= QUADRANT1
					self.high -= QUADRANT1
			else:
				self.low -= QUADRANT2
				self.code -= QUADRANT2
				self.high -= QUADRANT2
			self.low *= 2
			self.high *= 2
			self.code = self.in_iter() + self.code * 2

	def add_suffix_1(self, pos, find):
		"""Find the longest match for the text at pos by brute force.

		Has the same interface as add_suffix_2() but keeps no tables.
		Returns (None, 0) if find is false and (None, -1) if no
		match was found.
		"""
		# naive implementation used for testing

		if not find:
			return (None, 0)
		src = self.src
		mlen = min(1000, self.max_match, len(src) - pos)
		hist_start = max(pos - HIST_LEN, 0)
		while mlen >= MIN_MATCH_LEN:
			i = src.rfind(src[pos : pos + mlen], hist_start, pos)
			if i != -1:
				assert (src[pos : pos + mlen]
					== src[i: i + mlen])
				return (i, mlen)
			mlen -= 1
		return (None, -1)

	def add_suffix_2(self, pos, find):
		"""Add the text at pos to the dictionary, optionally matching it.

		pos -- position in self.src of the text to add
		find -- true to also look for an earlier match for the text

		Also removes the position that has just left the history
		window.  Returns a tuple (mpos, mlen) giving the position
		and length of the match found, or (None, -1) if no match was
		found or none was asked for.
		"""
		# a two level dictionary look up that leverages Python's
		# built-in dicts to get something that's hopefully faster
		# than implementing binary trees completely in Python.
		#
		# suffix_table maps the first MIN_MATCH_LEN characters of a
		# suffix to [count, head, table2, chars].  head is the newest
		# position with that key and next_table chains to the older
		# ones.  table2 maps the following chars characters to the
		# newest position that has them, chained by next2_table.

		src = self.src
		suffix_table = self.suffix_table
		max_match = min(self.max_match, len(src) - pos)

		mlen = -1
		mpos = None

		# positions at or below hist_invalid are outside the window
		hist_invalid = pos - HIST_LEN - 1
		modpos = pos % HIST_LEN
		pos2 = pos + MIN_MATCH_LEN

		key = src[pos : pos2]
		a = suffix_table.get(key)
		if a is not None:
			next1 = self.next_table
			next2 = self.next2_table

			[count, head, table2, chars] = a

			pos3 = pos2 + chars
			key2 = src[pos2 : pos3]
			min_match2 = MIN_MATCH_LEN + chars
			if find:
				# First try the positions that share both
				# the first and second level keys.
				p = table2.get(key2, hist_invalid)
				maxmlen = max_match - min_match2
				while p > hist_invalid and mlen != maxmlen:
					p3 = p + min_match2
					if mpos is None and p3 <= pos:
						mpos = p
						mlen = 0
					if p3 >= pos:
						p = next2[p % HIST_LEN]
						continue
					rlen = _match(src, pos3, p3, mlen,
						min(maxmlen, pos - p3))
					if rlen is not None:
						mpos = p
						mlen = rlen
					p = next2[p % HIST_LEN]
			if mpos is not None:
				mlen += min_match2
			elif find:
				# Otherwise try every position sharing just
				# the first level key.
				p = head
				maxmlen = min(chars, max_match - MIN_MATCH_LEN)
				i = 0
				while (p > hist_invalid and i < 50000
					and mlen < maxmlen):
					assert i < count
					i += 1
					p2 = p + MIN_MATCH_LEN
					l2 = pos - p2
					if mpos is None and l2 >= 0:
						mpos = p
						mlen = 0
					if l2 <= 0:
						p = next1[p % HIST_LEN]
						continue
					if l2 > maxmlen:
						l2 = maxmlen
					m = mlen + 1
					if src.startswith(src[p2 : p2 + m],
						pos2):
						mpos = p
						for j in range(m, l2):
							if (src[pos2 + j]
								!= src[p2 + j]):
								mlen = j
								break
						else:
							mlen = l2
					p = next1[p % HIST_LEN]

				if mpos is not None:
					mlen += MIN_MATCH_LEN

			# Lengthen the second level key as the number of
			# suffixes sharing the first level key grows.
			count += 1
			new_chars = int(log(count, 2))
			new_chars = min(new_chars, max_match - MIN_MATCH_LEN)
			if new_chars > chars:
				chars = new_chars
				table2 = _rehash_table2(src, chars, head,
					next1, next2, hist_invalid)

			next1[modpos] = head
			head = pos

			key2 = src[pos2 : pos2 + chars]
			head2 = table2.get(key2, hist_invalid)
			next2[modpos] = head2
			table2[key2] = pos

			a[0] = count
			a[1] = head
			a[2] = table2
			a[3] = chars
		else:
			# First suffix with this key; start with an empty
			# second level key.
			self.next_table[modpos] = hist_invalid
			self.next2_table[modpos] = hist_invalid
			key2 = ""
			suffix_table[key] = [1, pos, {key2: pos}, len(key2)]

		# Forget the position that has just left the history window.
		p = pos - HIST_LEN
		if p >= 0:
			p2 = p + MIN_MATCH_LEN
			key = src[p : p2]
			a = suffix_table[key]
			(count, head, table2, chars) = a
			count -= 1
			if count == 0:
				assert head == p
				del suffix_table[key]
			else:
				key2 = src[p2 : p2 + chars]
				if table2[key2] == p:
					del table2[key2]
				a[0] = count
		assert (mpos is None
			or src[pos : pos + mlen] == src[mpos : mpos + mlen])
		return (mpos, mlen)

	def _add_suffix(self, pos, find):
		"""Debugging wrapper for add_suffix_2() that prints its result."""
		r = self.add_suffix_2(pos, find)
		start_pos = self.start_pos
		if find and r[0] is not None:
			print ("%4d %02x %4d %2d"
			       % (pos - start_pos, ord(self.src[pos]),
				  r[0] - start_pos, r[1]))
		else:
			print ("%4d %02x"
			       % (pos - start_pos, ord(self.src[pos])))
		return r

	# the suffix search implementation used by encode()
	add_suffix = add_suffix_2

	def output_bit(self, bit):
		"""Output a bit followed by any pending opposite bits."""
		self.append_bit(bit)
		bit ^= 1
		for i in range(self.shifts):
			self.append_bit(bit)
		self.shifts = 0

	def encode_char(self, char):
		"""Encode a character code and update the symbol model.

		char -- a literal byte value, or 256 plus
			(match length - MIN_MATCH_LEN) for a match
		"""
		low = self.low
		high = self.high
		sym_cum = self.sym_cum

		symbol = self.char_to_symbol[char]
		span = high - low

		high = low + span * sym_cum[symbol - 1] // sym_cum[0]
		low += span * sym_cum[symbol] // sym_cum[0]
		debug(high, "high")
		debug(low, "low")
		# Renormalize: output bits until the interval is wide
		# enough again.
		while True:
			if high <= QUADRANT2:
				self.output_bit(0)
			elif low >= QUADRANT2:
				self.output_bit(1)
				low -= QUADRANT2
				high -= QUADRANT2
			elif low >= QUADRANT1 and high <= QUADRANT3:
				# the next bit isn't known yet, remember to
				# output its opposite after it
				self.shifts += 1
				low -= QUADRANT1
				high -= QUADRANT1
			else:
				break
			low *= 2
			high *= 2
		self.low = low
		self.high = high
		self.update_model_encode(symbol)

	def encode_position(self, position):
		"""Encode a match position.

		position -- the distance back from the current position to
			    the start of the match, less one
		"""
		position_cum = self.position_cum
		low = self.low
		high = self.high

		span = high - low
		high = low + span * position_cum[position] // position_cum[0]
		low += span * position_cum[position + 1] // position_cum[0]

		debug(high, "high")
		debug(low, "low")
		# Renormalize, exactly as in encode_char().
		while True:
			if high <= QUADRANT2:
				self.output_bit(0)
			elif low >= QUADRANT2:
				self.output_bit(1)
				low -= QUADRANT2
				high -= QUADRANT2
			elif low >= QUADRANT1 and high <= QUADRANT3:
				self.shifts += 1
				low -= QUADRANT1
				high -= QUADRANT1
			else:
				break
			low *= 2
			high *= 2

		self.low = low
		self.high = high

	def encode(self, src, progress = None):
		"""Compress a string.

		src -- the data to compress
		progress -- optional prefix string; if given the percentage
			    completed is written to sys.stderr after it

		Returns the compressed data as a string.  An empty string
		compresses to an empty string.
		"""

		length = len(src)
		if length == 0:
			return ""

		out_array = array.array('B')
		self.out_array = out_array
		self.append_bit = out_array.append

		self.init(False)

		# Put spaces in front of the data and add them to the
		# dictionary so the start of the data can match them.
		max_match = min(MAX_MATCH_LEN, length)
		self.max_match = max_match
		self.src = src = "\x20" * max_match + src

		in_length = len(src)

		self.start_pos = max_match

		for in_pos in range(max_match):
			self.add_suffix(in_pos, False)
		in_pos = max_match
		last_percent = -1
		while in_pos < in_length:
			if progress:
				percent = (in_pos - max_match) * 100 // length
				if percent != last_percent:
					sys.stderr.write("%s%3d%%\r"
							 % (progress, percent))
					last_percent = percent
			debug(ord(src[in_pos]), "src")
			(match_pos, match_len) = self.add_suffix(in_pos, True)
			if match_len < MIN_MATCH_LEN:
				self.encode_char(ord(src[in_pos]))
			else:
				debug(in_pos - match_pos - 1, "match_pos")
				debug(match_len, "match_len")
				self.encode_char(256 - MIN_MATCH_LEN
						 + match_len)
				self.encode_position(in_pos - match_pos - 1)
				# the rest of the matched text still has to
				# be added to the dictionary
				for i in range(match_len - 1):
					in_pos += 1
					self.add_suffix(in_pos, False)
			in_pos += 1

		# Flush the arithmetic coder.
		self.shifts += 1
		if self.low < QUADRANT1:
			self.output_bit(0)
		else:
			self.output_bit(1)

		if progress:
			sys.stderr.write("%s100%%\n" % progress)

		return bit_array_to_string(out_array)

	def decode(self, src, out_length, progress = None):
		"""Decompress a string.

		src -- the compressed data
		out_length -- length of the uncompressed data
		progress -- optional prefix string; if given the percentage
			    completed is written to sys.stderr after it, at
			    most once a second

		Returns the uncompressed data as a string.  Raises
		ValueError if the compressed data ends too soon or contains
		a match that extends past out_length.
		"""

		bits = string_to_bit_array(src)
		bits.fromlist([0] * 32)	 # add some extra bits
		self.in_iter = iter(bits).next

		out = array.array('B', "\0") * out_length
		outpos = 0

		self.init(True)

		self.code = 0
		for i in range(ARITH_BITS + 2):
			self.code += self.code + self.in_iter()

		# The history window starts out filled with spaces, with
		# the first byte decoded going in MAX_MATCH_LEN from the end.
		hist_pos = HIST_LEN - MAX_MATCH_LEN
		history = [0x20] * hist_pos + [0] * MAX_MATCH_LEN

		decode_char = self.decode_char
		last_percent = -1
		last_time = time.time()
		try:
			while outpos < out_length:
				if progress:
					percent = outpos * 100 // out_length
					if percent != last_percent:
						now = time.time()
						if now - last_time >= 1:
							sys.stderr.write(
								"%s%3d%%\r"
								% (progress,
								   percent))
							last_percent = percent
							last_time = now
				char = decode_char()
				if char >= 0x100:
					pos = self.decode_position()
					length = char - 0x100 + MIN_MATCH_LEN
					if length > out_length - outpos:
						# would write past the end
						# of the output
						raise ValueError(
							"compressed input"
							" is corrupt")
					base = (hist_pos - pos - 1) % HIST_LEN
					# Copy a byte at a time, the match
					# can overlap the bytes being added.
					for off in range(length):
						byte = history[(base + off)
							       % HIST_LEN]
						out[outpos] = byte
						outpos += 1
						history[hist_pos] = byte
						hist_pos = ((hist_pos + 1)
							    % HIST_LEN)
				else:
					out[outpos] = char
					outpos += 1
					history[hist_pos] = char
					hist_pos = (hist_pos + 1) % HIST_LEN
		except StopIteration:
			# Ran out of bits.  Report it the same way the
			# module-level decode() does when it uses the C
			# implementation, rather than leaking StopIteration.
			raise ValueError("compressed input is corrupt")

		self.in_iter = None
		if progress:
			sys.stderr.write("%s100%%\n" % progress)
		return out.tostring()

# Module-level entry points.  They use the C implementation from the mymcsup
# module if it could be imported and the pure Python codec otherwise.
if mymcsup is None:
	def decode(src, out_length, progress = None):
		"""Decompress src, which is out_length bytes uncompressed."""
		return lzari_codec().decode(src, out_length, progress)

	def encode(src, progress = None):
		"""Compress src and return the compressed string."""
		return lzari_codec().encode(src, progress)
else:
	mylzari_decode = mymcsup.mylzari_decode
	mylzari_encode = mymcsup.mylzari_encode
	mylzari_free_encoded = mymcsup.mylzari_free_encoded

	def decode(src, out_length, progress = None):
		"""Decompress src, which is out_length bytes uncompressed.

		Raises ValueError if mylzari_decode() reports an error.
		"""
		out = ctypes.create_string_buffer(out_length)
		if (mylzari_decode(src, len(src), out, out_length, progress)
		    == -1):
			raise ValueError("compressed input is corrupt")
		return ctypes.string_at(out, out_length)

	def encode(src, progress = None):
		"""Compress src and return the compressed string.

		Raises MemoryError if mylzari_encode() reports an error.
		"""
		(r, compressed, comp_len) = mylzari_encode(src, len(src),
							   progress)
		if r == -1:
			raise MemoryError("out of memory during compression")
		try:
			return ctypes.string_at(compressed.value,
						comp_len.value)
		finally:
			# release the buffer even if copying it failed
			mylzari_free_encoded(compressed)

def main2(args):
	"""Compress or decompress a file and print how long it took.

	args[1] -- "c" to compress, anything else to decompress
	args[2] -- name of the input file
	args[3] -- name of the output file

	A compressed file starts with the length of the uncompressed data
	as a 4 byte unsigned integer in the machine's byte order.
	"""
	import struct
	import os

	# "=L" is always 4 bytes.  Plain "L" uses the native C size, which
	# is 8 bytes on many 64-bit systems, so the 4 byte header read
	# below could never be unpacked there.  Where native "L" is 4 bytes
	# the result is the same as before.
	header = "=L"

	f = open(args[2], "rb")
	try:
		src = f.read()
	finally:
		f.close()
	lzari = lzari_codec()
	out = open(args[3], "wb")
	try:
		start = os.times()
		if args[1] == "c":
			dest = lzari.encode(src)
			now = os.times()
			out.write(struct.pack(header, len(src)))
		else:
			dest = lzari.decode(src[4:],
					    struct.unpack(header, src[:4])[0])
			now = os.times()
		out.write(dest)
	finally:
		out.close()
	# user, system and elapsed time
	sys.stdout.write("time: %s %s %s\n"
			 % (now[0] - start[0], now[1] - start[1],
			    now[4] - start[4]))


def _get_hotshot_lineinfo(filename):
	"""Total up the line events in a hotshot log file.

	Returns a list of (location, [count, time]) pairs, where count is
	the number of line events logged for the location and time is the
	sum of their time deltas.
	"""
	import hotshot.log
	timings = {}
	for what, loc, tdelta in hotshot.log.LogReader(filename):
		if what != hotshot.log.LINE:
			continue
		totals = timings.setdefault(loc, [0, 0])
		totals[0] += 1
		totals[1] += tdelta
	return timings.items()

def _dump_hotshot_lineinfo(log):
	"""Print the time and count totals for each line in a hotshot log.

	log -- name of the hotshot log file

	Each output line gives the time, its percentage of the total time,
	the count, its percentage of the total count and the location.
	"""
	entries = sorted(_get_hotshot_lineinfo(log))
	total_count = 0
	total_time = 0
	for (loc, totals) in entries:
		total_count += totals[0]
		total_time += totals[1]
	for (loc, (count, elapsed)) in entries:
		stats = ("%8d %6.3f%%  %8d %6.3f%%"
			 % (elapsed, elapsed * 100.0 / total_time,
			    count, count * 100.0 / total_count))
		where = "%s:%d(%s)" % loc
		print ("%s %s" % (stats, where))

def _print_remaining_lines(f):
	"""Print what is left of the open source file f and close it."""
	for line in f:
		print (line[:-1])
	f.close()

def _dump_hotshot_lineinfo2(log):
	"""Print source files annotated with the totals from a hotshot log.

	log -- name of the hotshot log file

	Each source file is printed after a "# filename" line, with a
	"# time percent  count percent" line after each source line that
	has line events logged for it.  If a source file can't be opened
	only the totals are printed for it.
	"""
	cur = None
	f = None
	a = sorted(_get_hotshot_lineinfo(log))
	total_count = sum((totals[0]
			   for (loc, totals) in a))
	total_time = sum((totals[1]
			  for (loc, totals) in a))
	for ((filename, lineno, fn), [count, elapsed]) in a:
		if cur != filename:
			if f is not None:
				_print_remaining_lines(f)
			try:
				f = open(filename, "r")
			except EnvironmentError:
				# Failing to open a file raises IOError,
				# which "except OSError" doesn't catch in
				# Python 2.  EnvironmentError covers both.
				f = None
			cur = filename
			l = 0
			print ("# %s" % (filename,))
		if f is not None:
			# print the source up to and including this line
			while l < lineno:
				print (f.readline()[:-1])
				l += 1
		print ("# %8d %6.3f%%  %8d %6.3f%%"
		       % (elapsed, elapsed * 100.0 / total_time,
			  count, count * 100.0 / total_count))
	if f is not None:
		_print_remaining_lines(f)

def main(args):
	"""Run main2(), optionally under a profiler.

	args[1] selects what to do:

	"pc"  -- print five calibration values for the profile module
	"p"   -- run main2(args[1:]) under the profile module
	"h"   -- run main2(args[1:]) under hotshot and print statistics
	"hl"  -- as "h" but print the totals for each line instead
	"hl2" -- as "hl" but print them as annotated source files

	Anything else is passed on to main2() with args unchanged.
	"""
	import os

	if args[1] == "pc":
		import profile
		pr = profile.Profile()
		for i in range(5):
			print (pr.calibrate(100000))
		return
	elif args[1] == "p":
		import profile
		# The assignment made by the profiled statement happens in
		# the dictionary returned by locals() and doesn't change
		# this variable, so 0 is always returned.
		ret = 0
		profile.runctx("ret = main2(args[1:])",
			       globals(), locals())
		return ret
	elif args[1].startswith("h"):
		import hotshot
		import hotshot.stats
		import tempfile
		import warnings

		warnings.filterwarnings("ignore")
		# mkstemp() creates the file securely, unlike os.tempnam()
		# which only picks a name that another process could then
		# create first.  The profiler reopens the file by name.
		(fd, tmp) = tempfile.mkstemp()
		os.close(fd)
		try:
			l = args[1].startswith("hl")
			p = hotshot.Profile(tmp, l)
			ret = p.runcall(main2, args[1:])
			p.close()
			p = None
			if l:
				if args[1] == "hl2":
					_dump_hotshot_lineinfo2(tmp)
				else:
					_dump_hotshot_lineinfo(tmp)
			else:
				hotshot.stats.load(tmp).print_stats()
		finally:
			try:
				os.remove(tmp)
			except OSError:
				pass
		return ret

	return main2(args)

# When run as a script, pass the command line to main() and use the value
# it returns as the exit status.
if __name__ == '__main__':
	sys.exit(main(sys.argv))