1
0
mirror of synced 2024-12-15 15:51:15 +01:00
bemaniutils/bemani/protocol/binary.py

881 lines
33 KiB
Python
Raw Normal View History

import struct
from typing import Optional, List, Dict, Any
from typing_extensions import Final
from bemani.protocol.stream import InputStream, OutputStream
from bemani.protocol.node import Node
class BinaryEncodingException(Exception):
    """
    Error raised by this module when a binary packet cannot be decoded into a
    node tree, or when a node tree cannot be encoded into a binary packet.
    """
class PackedOrdering:
    """
    A class that helps us encapsulate Konami's backtracking hole-fill algorithm.
    Everything is aligned on a boundary appropriate for its data size. Strings and arrays are
    forced to be aligned to a 4 byte boundary on account of having an integer length field.
    All of these are padded to 4 bytes in terms of the room they take up in the stream.

    For the things that are 2 byte or 1 byte aligned, we end up sticking them after each other
    in 4 byte increments. That is to say, if we have an unsigned byte to pack, we reserve 4 bytes
    and stick it in the first byte slot, and if up to three additional bytes come in we will pack
    them after this in sequential order. It would make sense to not pad out strings and arrays and
    store bytes/shorts in these unused locations, but that's not what actually happens. Also note
    that we will never pack bytes after a short or vice versa, even if there is room. This also
    explains the bizarre behavior of not using spare bytes after strings or arrays. To emphasize:
    everything is stored aligned, and in a 4 byte chunk, only similarly-sized objects can be
    packed. If this 4 byte chunk is already partially occupied, we can only add another thing to
    it if 1) the item being added is the same size as the object that exists and 2) the object
    can be added with the correct alignment.

    A simple example:

        [1: byte] [2: byte] [3: integer]

    Packing would look like this (assuming all locations are a byte):

        1 2 0 0 3 3 3 3

    An example:

        [1: byte] [2: string, length 3] [3: short] [4: byte]

    Packing would look like this (assuming all locations are a byte):

        1 4 0 0 2 2 2 2 2 2 2 0 3 3 0 0
    """

    def __init__(self, size: int, allow_expansion: bool=False) -> None:
        """
        Initialize with a known size. If this is to be used to create a packing instead of deduce
        a packing, then allow_expansion should be set to True and new holes will be created when
        needed. If this is to be used for decoding a current packing, allow_expansion should be set
        to False to ensure we don't choose locations outside the buffer.

        Parameters:
            size - Number of bytes to work with as an integer
            allow_expansion - Boolean describing whether to add to the end of the order when needed
        """
        # One slot per byte. None means "unused", otherwise the slot holds the
        # (rounded) size of the object that was marked as occupying it.
        self.order: List[Optional[int]] = [None] * size
        self.expand = allow_expansion
        self.__orderlen = size
        # Chunk offsets at which the previous search of each kind ended, so that
        # the next search of that kind does not rescan earlier chunks.
        self.__lastbyte = 0
        self.__lastshort = 0
        self.__lastint = 0

    def __append_empty(self) -> None:
        """
        Grow the order by a single unused slot.
        """
        self.order.append(None)
        self.__orderlen = self.__orderlen + 1

    def __pad_to_chunk(self) -> None:
        """
        If we expand for additions, make sure the order is padded out to a 4 byte boundary
        so that every search below works on whole 4 byte chunks.
        """
        if self.expand:
            while (self.__orderlen & 3) != 0:
                self.__append_empty()

    def mark_used(self, size: int, offset: int, round_to: int=1) -> None:
        """
        Mark size bytes at offset as being used. If needed, round to the nearest byte/half/integer.

        Parameters:
            size - Number of bytes to mark
            offset - Offset into binary chunk to start marking
            round_to - Optional integer specifying how many bytes to round to. Valid values are 1, 2 and 4

        Raises:
            BinaryEncodingException - if expansion is not allowed and the marked region would
                                      run past the end of the buffer.
        """
        # Round up to the next multiple of round_to if needed
        while (size & (round_to - 1)) != 0:
            size = size + 1

        if self.expand:
            # Expand buffer if needed
            while self.__orderlen < (size + offset):
                self.__append_empty()
        elif (size + offset) > self.__orderlen:
            # A fixed-size ordering describes an existing buffer, so a region that does
            # not fit means the length we were given is bad. Report it as an encoding
            # problem instead of letting the marking loop die with a bare IndexError.
            raise BinaryEncodingException(
                f"Cannot mark {size} bytes at offset {offset}, buffer is only {self.__orderlen} bytes!"
            )

        # Mark buffer as used
        for i in range(size):
            self.order[i + offset] = size

    def get_next_byte(self) -> Optional[int]:
        """
        Returns an integer location where the next byte will be found/stored, respecting Konami logic.
        Will return None if it's not possible to find this byte a spot and we aren't expanding.
        """
        self.__pad_to_chunk()

        for i in range(self.__lastbyte, self.__orderlen, 4):
            if self.order[i] is not None:
                # This chunk is occupied, see if it has room for a byte
                for j in range(0, 4):
                    if self.order[i + j] == 1:
                        # This is okay, we can pack after this
                        continue
                    elif self.order[i + j] is None:
                        # This is open, pack here
                        self.__lastbyte = i
                        return i + j
                    else:
                        # This is something else, can't pack here
                        break
            else:
                # The chunk is untouched, start a new run of bytes here
                self.__lastbyte = i
                return i

        if self.expand:
            self.__lastbyte = self.__orderlen
            return self.__orderlen
        else:
            return None

    def get_next_short(self) -> Optional[int]:
        """
        Returns an integer location where the next short will be found/stored, respecting Konami logic.
        Will return None if it's not possible to find this short a spot and we aren't expanding.
        """
        self.__pad_to_chunk()

        for i in range(self.__lastshort, self.__orderlen, 4):
            if self.order[i] is not None:
                # This chunk is occupied, see if it has room for a short
                for j in range(0, 4, 2):
                    if self.order[i + j] == 2 and self.order[i + j + 1] == 2:
                        # This is okay, we can pack after this
                        continue
                    elif self.order[i + j] is None and self.order[i + j + 1] is None:
                        # This is open, pack here
                        self.__lastshort = i
                        return i + j
                    else:
                        # This is something else, can't pack here
                        break
            else:
                # The chunk is untouched, start a new run of shorts here
                self.__lastshort = i
                return i

        if self.expand:
            self.__lastshort = self.__orderlen
            return self.__orderlen
        else:
            return None

    def get_next_int(self) -> Optional[int]:
        """
        Returns an integer location where the next integer will be found/stored, respecting Konami logic.
        Will return None if it's not possible to find this integer a spot and we aren't expanding.
        """
        self.__pad_to_chunk()

        for i in range(self.__lastint, self.__orderlen, 4):
            # An integer needs a completely unused 4 byte chunk
            if all(self.order[i + k] is None for k in range(4)):
                self.__lastint = i
                return i

        if self.expand:
            self.__lastint = self.__orderlen
            return self.__orderlen
        else:
            return None

    @staticmethod
    def node_to_body_ordering(node: "Node", include_children: bool=True, include_void: bool=False) -> List[Dict[str, Any]]:
        """
        Walk this node, attributes and children in the correct order to create a node
        ordering for the purpose of mapping Node objects to their actual data
        in a binary packet data chunk. We will use this to unpack data to determine the
        values of nodes, or to create the data that goes with these nodes.

        Parameters:
            node - The Node to start walking from.
            include_children - Whether this ordering should include children. Defaults to True.
            include_void - Whether this ordering should include positions for void nodes
                           (nodes whose data length is 0). Applies to the children as well
                           when they are included. Defaults to False.

        Returns:
            List of dictionary objects:
                - type - 'attribute' or 'value' to specify that this position in the
                         node walk is a string attribute or a node value
                - node - This Node object, for the purpose of assignment
                - name - The name of the attribute if type is 'attribute' or the name
                         of the node if type is 'value'
                - alignment - The alignment that this particular data object requires
        """
        ordering: List[Dict[str, Any]] = []

        # Include the node itself if it has a value or we include voids
        if node.data_length != 0 or include_void:
            alignment = node.data_length
            if alignment is None:
                # Take care of string types
                alignment = 4
            if alignment > 4:
                # Take care of 64 bit integers that are 32 bit aligned
                alignment = 4
            ordering.append({
                'type': 'value',
                'node': node,
                'name': node.name,
                'alignment': alignment,
            })

        # Attributes follow the node value, sorted by name
        for attr in sorted(node.attributes.keys()):
            ordering.append({
                'type': 'attribute',
                'node': node,
                'name': attr,
                'alignment': 4,
            })

        if include_children:
            for child in node.children:
                # Forward include_void so that void handling applies to the whole
                # walk and not just to the node the caller happened to start at.
                ordering.extend(
                    PackedOrdering.node_to_body_ordering(child, include_void=include_void)
                )

        return ordering
class BinaryDecoder:
    """
    A class capable of taking a binary blob and decoding it to a Node tree.
    """

    def __init__(self, data: bytes, encoding: str, compressed: bool) -> None:
        """
        Initialize the object.

        Parameters:
            - data - A binary blob of data to be decoded
            - encoding - A string representing the text encoding for string elements. Should be either
                         'shift-jis', 'euc-jp' or 'utf-8'
            - compressed - True if node names are stored as packed 6-bit characters, False if they
                           are stored as length-prefixed bytes in the given text encoding
        """
        self.stream = InputStream(data)
        self.encoding = encoding
        self.compressed = compressed
        self.executed = False

    def __read_node_name(self) -> str:
        """
        Given the current position in the stream, read the name of the node. In compressed
        mode this is a 6-bit-byte packed string, otherwise it is a length-prefixed string
        in the stream's text encoding.

        Returns:
            A string representing the name

        Raises:
            BinaryEncodingException - if the stream runs out of data or the length is invalid.
        """
        length = self.stream.read_int()
        if length is None:
            raise BinaryEncodingException("Ran out of data when attempting to read node name length!")

        if not self.compressed:
            # Lengths 1-64 are stored in one byte as 0x40-0x7f. Anything longer is
            # stored in two bytes, with the top bit of the first byte set.
            if length < 0x40:
                raise BinaryEncodingException("Node name length under decompressed minimum")
            elif length < 0x80:
                length -= 0x3f
            else:
                length_ex = self.stream.read_int()
                if length_ex is None:
                    raise BinaryEncodingException("Ran out of data when attempting to read node name length!")
                length = (length << 8) | length_ex
                length -= 0x7fbf

            if length > BinaryEncoding.NAME_MAX_DECOMPRESSED:
                raise BinaryEncodingException("Node name length over decompressed limit")

            name = self.stream.read_blob(length)
            if name is None:
                raise BinaryEncodingException("Ran out of data when attempting to read node name!")
            return name.decode(self.encoding)

        if length > BinaryEncoding.NAME_MAX_COMPRESSED:
            raise BinaryEncodingException("Node name length over compressed limit")

        # Each character takes 6 bits, and the whole name is padded out to whole bytes
        binary_length = ((length * 6) + 7) // 8

        # Gather the packed bytes into one big-endian integer
        packed = 0
        for _ in range(binary_length):
            next_byte = self.stream.read_int()
            if next_byte is None:
                raise BinaryEncodingException("Ran out of data when attempting to read node name!")
            packed = (packed << 8) | next_byte

        # Peel the characters off 6 bits at a time from the most significant end,
        # ignoring the pad bits that are left over at the bottom.
        total_bits = binary_length * 8
        return ''.join(
            Node.NODE_NAME_CHARS[(packed >> (total_bits - (6 * (index + 1)))) & 0x3f]
            for index in range(length)
        )

    def __read_node(self, node_type: int) -> Node:
        """
        Given an integer node type, read the node's name, possible attributes
        and children. Will return a Node representing this node. Note
        that calling this on the first node should return a tree of all nodes.

        Parameters:
            node_type - The integer type of the node, as already read from the stream

        Returns:
            Node object
        """
        name = self.__read_node_name()
        node = Node(name=name, type=node_type)

        while True:
            child_type = self.stream.read_int()
            if child_type is None:
                raise BinaryEncodingException("Ran out of data when attempting to read node type!")

            if child_type == Node.END_OF_NODE:
                return node
            elif child_type == Node.ATTR_TYPE:
                # Only the attribute name lives in the header, its value is in the body
                key = self.__read_node_name()
                node.set_attribute(key)
            else:
                child = self.__read_node(child_type)
                node.add_child(child)

    def __decode_body(self, root: Node, body: bytes, body_length: int) -> None:
        """
        Walk the tree in body order and assign every attribute and node value
        from the packed body data.

        Parameters:
            root - The root of the tree as parsed from the header
            body - The raw body bytes
            body_length - The length of the body in bytes

        Raises:
            BinaryEncodingException - if the body does not contain the data the header describes.
        """
        ordering = PackedOrdering(body_length)

        for value in PackedOrdering.node_to_body_ordering(root):
            node = value['node']

            if value['type'] == 'attribute':
                # Attributes are always stored as strings
                size = None
                enc = 's'
                dtype = 'str'
                array = False
                composite = False
            else:
                size = node.data_length
                enc = node.data_encoding
                dtype = node.data_type
                array = node.is_array
                composite = node.is_composite

            if composite and array:
                raise Exception('Logic error, no support for composite arrays!')

            if array:
                # Array value
                loc = ordering.get_next_int()
                if loc is None:
                    raise BinaryEncodingException("Ran out of data when attempting to read array length location!")
                if not size:
                    raise BinaryEncodingException(f"Array node '{value['name']}' has no fixed element size!")

                # The raw size in bytes
                length = struct.unpack('>I', body[loc:(loc + 4)])[0]
                if (length % size) != 0:
                    raise BinaryEncodingException(
                        f"Array node '{value['name']}' has length {length} which is not a multiple of {size}!"
                    )
                # The length comes from the packet, so make sure the padded
                # array actually fits before trusting it.
                if (loc + ((length + 4 + 3) & ~3)) > body_length:
                    raise BinaryEncodingException("Ran out of data when attempting to read array data!")

                ordering.mark_used(length + 4, loc, round_to=4)
                loc = loc + 4
                elems = length // size
                val = struct.unpack(f'>{enc * elems}', body[loc:(loc + length)])
                node.set_value(list(val))
                continue

            # Scalar value
            alignment = value['alignment']
            if alignment == 1:
                loc = ordering.get_next_byte()
            elif alignment == 2:
                loc = ordering.get_next_short()
            elif alignment == 4:
                loc = ordering.get_next_int()
            else:
                raise BinaryEncodingException(
                    f"Node '{value['name']}' has unsupported alignment {alignment}!"
                )
            if loc is None:
                raise BinaryEncodingException("Ran out of data when attempting to read node data location!")

            if size is None:
                # The size should be read from the first 4 bytes
                size = struct.unpack('>I', body[loc:(loc + 4)])[0]
                # The size comes from the packet, so make sure the padded
                # data actually fits before trusting it.
                if (loc + ((size + 4 + 3) & ~3)) > body_length:
                    raise BinaryEncodingException("Ran out of data when attempting to read node data!")
                ordering.mark_used(size + 4, loc, round_to=4)
                loc = loc + 4
                decode_data = body[loc:(loc + size)]
                decode_value = f'>{size}{enc}'
            else:
                # The size is built-in
                if (loc + size) > body_length:
                    raise BinaryEncodingException("Ran out of data when attempting to read node data!")
                ordering.mark_used(size, loc)
                decode_data = body[loc:(loc + size)]
                decode_value = f'>{enc}'

            if composite:
                # Several values packed together, keep them all
                node.set_value(list(struct.unpack(decode_value, decode_data)))
                continue

            val = struct.unpack(decode_value, decode_data)[0]

            if dtype == 'str':
                # Need to convert this from encoding to standard string.
                # Also, need to lop off the trailing null.
                try:
                    val = val[:-1].decode(self.encoding, 'replace')
                except UnicodeDecodeError:
                    # Nothing we can do here
                    pass

            if value['type'] == 'attribute':
                node.set_attribute(value['name'], val)
            else:
                node.set_value(val)

    def get_tree(self) -> Node:
        """
        Parse the header and body such that we can return a Node tree
        representing the data passed to us.

        Returns:
            Node object

        Raises:
            BinaryEncodingException - if called more than once, or if the data is
                                      truncated or malformed.
        """
        if self.executed:
            raise BinaryEncodingException("Logic error, should only call this once per instance")
        self.executed = True

        # Read the header first
        header_length = self.stream.read_int(4)
        if header_length is None:
            raise BinaryEncodingException("Ran out of data when attempting to read header length!")

        node_type = self.stream.read_int()
        if node_type is None:
            raise BinaryEncodingException("Ran out of data when attempting to read root node type!")
        root = self.__read_node(node_type)

        eod = self.stream.read_int()
        if eod != Node.END_OF_DOCUMENT:
            raise BinaryEncodingException(f'Unknown node type {eod} at end of document')

        # Skip by any padding
        while self.stream.pos < header_length + 4:
            # NOTE(review): the other stream reads in this class return None once the
            # data is exhausted; assuming read_byte() does too, a header length that
            # points past the end of the data would otherwise never let this loop end.
            if self.stream.read_byte() is None:
                raise BinaryEncodingException("Ran out of data when attempting to skip header padding!")

        # Read the body next
        body_length = self.stream.read_int(4)
        if body_length is not None and body_length > 0:
            # We have a body
            body = self.stream.read_blob(body_length)
            if body is None:
                raise BinaryEncodingException('Body has insufficient data')
            self.__decode_body(root, body, body_length)

        return root
class BinaryEncoder:
    """
    A class capable of taking a Node tree and encoding it into a binary format.
    """

    def __init__(self, tree: Node, encoding: str, compressed: bool=True) -> None:
        """
        Initialize the object.

        Parameters:
            tree - The Node tree which should be encoded
            encoding - A string representing the text encoding for string elements. Should be either
                       'shift-jis', 'euc-jp' or 'utf-8'
            compressed - True to write node names as packed 6-bit characters, False to write
                         them as length-prefixed bytes in the given text encoding
        """
        self.stream = OutputStream()
        self.encoding = encoding
        self.tree = tree
        self.__body: List[int] = []
        self.__body_len = 0
        self.executed = False
        self.compressed = compressed

        # Generate the character LUT, mapping each allowed name character to its 6-bit value
        self.char_lut: Dict[str, int] = {
            char: index for index, char in enumerate(Node.NODE_NAME_CHARS)
        }

    def __write_node_name(self, name: str) -> None:
        """
        Given the current position in the stream, write the name of the node. In compressed
        mode this is a 6-bit-byte packed string, otherwise it is a length-prefixed string
        in the stream's text encoding.

        Parameters:
            name - A string name which should be encoded as a node name
        """
        if not self.compressed:
            encoded = name.encode(self.encoding)
            length = len(encoded)

            if length > BinaryEncoding.NAME_MAX_DECOMPRESSED:
                raise BinaryEncodingException("Node name length over decompressed limit")

            # The decoder treats a first byte of 0x40-0x7f as a one byte length of 1-64,
            # so a 64 byte name must use the short form (0x7f). Writing it in the long
            # form would start with 0x7f as well and be misread as a short length.
            if length <= 64:
                self.stream.write_int(length + 0x3f)
            else:
                length += 0x7fbf
                self.stream.write_int((length >> 8) & 0xff)
                self.stream.write_int(length & 0xff)

            self.stream.write_blob(encoded)
            return

        # Pack the characters together, six bits each, most significant first
        length = len(name)
        packed = 0
        for char in name:
            packed = (packed << 6) | self.char_lut[char]

        # Pad out the rest with zero bits so we end on a whole byte
        used_bits = length * 6
        pad_bits = (-used_bits) % 8
        packed <<= pad_bits

        # Output the character count followed by the packed bytes
        self.stream.write_int(length)
        for val in packed.to_bytes((used_bits + pad_bits) // 8, 'big'):
            self.stream.write_int(val)

    def __write_node(self, node: Node) -> None:
        """
        Write the header entry for a node to the stream: its type and name, the
        name of each of its attributes, all of its children (recursively) and
        finally the end of node marker. No values are written here, they go in the body.

        Parameters:
            node - A Node which should be encoded.
        """
        to_write = PackedOrdering.node_to_body_ordering(node, include_children=False, include_void=True)
        for thing in to_write:
            # First, write the type of this node out
            if thing['type'] == 'value':
                self.stream.write_int(thing['node'].type)
            else:
                self.stream.write_int(Node.ATTR_TYPE)

            # Now, write the name out
            self.__write_node_name(thing['name'])

        # Now, write out the children
        for child in node.children:
            self.__write_node(child)

        # Now, write out the end of node marker
        self.stream.write_int(Node.END_OF_NODE)

    def __add_data(self, data: bytes, length: int, offset: int) -> None:
        """
        Given some binary data, a length and an offset, add the data to the offset in the
        output body. This function will ensure that any new bytes that aren't copied are
        zeroed out. This includes bytes before the offset as well as any pad bytes after
        the offset + length in order to pad this body to a 4 byte boundary.

        Parameters:
            data - A blob of binary data which should be copied into the output
            length - Number of bytes of data to copy
            offset - Offset into the body to start copying
        """
        # Grow the body with zeros until the data fits
        shortfall = (length + offset) - self.__body_len
        if shortfall > 0:
            self.__body.extend([0] * shortfall)
            self.__body_len = self.__body_len + shortfall

        # Make sure it's padded to 4 bytes
        padding = (-self.__body_len) % 4
        if padding:
            self.__body.extend([0] * padding)
            self.__body_len = self.__body_len + padding

        for index in range(length):
            self.__body[offset + index] = data[index]

    def get_data(self) -> bytes:
        """
        Encode the header and body into binary format.

        Returns:
            Binary blob of data that can be decoded by a game.

        Raises:
            BinaryEncodingException - if a node or attribute has a value that cannot be encoded.
            Exception - on logic errors, such as calling this more than once per instance.
        """
        if self.executed:
            raise Exception("Logic error, should only call this once per instance")
        self.executed = True

        # Generate the header first
        self.__write_node(self.tree)
        self.stream.write_int(Node.END_OF_DOCUMENT)
        self.stream.write_pad(4)
        header_length = len(self.stream.data)
        header = self.stream.data[:]

        # Generate the body
        ordering = PackedOrdering(0, allow_expansion=True)

        for value in PackedOrdering.node_to_body_ordering(self.tree):
            node = value['node']

            if value['type'] == 'attribute':
                # Attributes are always stored as strings
                size = None
                enc = 's'
                dtype = 'str'
                array = False
                composite = False
                val = node.attribute(value['name'])
            else:
                size = node.data_length
                enc = node.data_encoding
                dtype = node.data_type
                array = node.is_array
                composite = node.is_composite
                val = node.value

            if val is None:
                raise BinaryEncodingException(
                    f'Node \'{value["name"]}\' has invalid value None',
                )

            if array:
                # Array value
                loc = ordering.get_next_int()
                if loc is None:
                    raise BinaryEncodingException("Ran out of data when attempting allocate array location!")
                if size is None:
                    raise Exception("Logic error, node size not set yet this is not an attribute!")

                # The raw size in bytes
                elems = len(val)
                length = elems * size

                # Start with the header (number of bytes taken up), then add
                # the data one element at a time
                encode_value = f'>{enc}'
                chunks = [struct.pack('>I', length)]
                for v in val:
                    if dtype == 'bool':
                        chunks.append(struct.pack(encode_value, 1 if v else 0))
                    else:
                        chunks.append(struct.pack(encode_value, v))

                self.__add_data(b''.join(chunks), length + 4, loc)
                ordering.mark_used(length + 4, loc, round_to=4)
                continue

            # Scalar value
            alignment = value['alignment']
            if alignment == 1:
                loc = ordering.get_next_byte()
            elif alignment == 2:
                loc = ordering.get_next_short()
            elif alignment == 4:
                loc = ordering.get_next_int()
            else:
                raise BinaryEncodingException(
                    f'Node \'{value["name"]}\' has unsupported alignment {alignment}!'
                )
            if loc is None:
                raise BinaryEncodingException("Ran out of data when attempting to allocate node location!")

            if dtype == 'str':
                # Need to convert this to encoding from standard string.
                # Also, need to add the trailing null.
                if not isinstance(val, str):
                    raise BinaryEncodingException(
                        f'Node \'{value["name"]}\' has non-string value!',
                    )

                try:
                    valbytes = val.encode(self.encoding) + b'\0'
                except UnicodeEncodeError:
                    raise BinaryEncodingException(
                        f'Node \'{value["name"]}\' has un-encodable string value \'{val}\''
                    )

                size = len(valbytes)
                self.__add_data(struct.pack('>I', size) + valbytes, size + 4, loc)
                ordering.mark_used(size + 4, loc, round_to=4)
                # We took care of this one
                continue
            elif dtype == 'bin':
                # Store raw binary
                size = len(val)
                self.__add_data(struct.pack('>I', size) + val, size + 4, loc)
                ordering.mark_used(size + 4, loc, round_to=4)
                # We took care of this one
                continue
            elif composite:
                # Array, but not, somewhat silly
                if size is None:
                    raise Exception("Logic error, node size not set yet this is not an attribute!")

                encode_value = f'>{enc}'
                self.__add_data(struct.pack(encode_value, *val), size, loc)
                ordering.mark_used(size, loc)
                # We took care of this one
                continue
            elif dtype == 'bool':
                val = 1 if val else 0

            # The size is built-in, emit it
            if size is None:
                raise Exception("Logic error, node size not set yet this is not an attribute!")

            encode_value = f'>{enc}'
            self.__add_data(struct.pack(encode_value, val), size, loc)
            ordering.mark_used(size, loc)

        return b''.join([
            struct.pack('>I', header_length),
            header,
            struct.pack('>I', self.__body_len),
            bytes(self.__body),
        ])
class BinaryEncoding:
    """
    Front end for the binary packet format: validates and strips the 4 byte
    packet prefix on decode, and builds it on encode.
    """

    # First byte of every binary packet
    MAGIC: Final[int] = 0xA0

    # Second byte of the packet, describing how node names are stored
    COMPRESSED_WITH_DATA: Final[int] = 0x42
    COMPRESSED_WITHOUT_DATA: Final[int] = 0x43
    DECOMPRESSED_WITH_DATA: Final[int] = 0x45
    DECOMPRESSED_WITHOUT_DATA: Final[int] = 0x46

    # Longest node name accepted in each mode
    NAME_MAX_COMPRESSED: Final[int] = 0x24
    NAME_MAX_DECOMPRESSED: Final[int] = 0x1000

    # The string values should match the constants in EAmuseProtocol.
    # I have no better way to link these than to write this comment,
    # as otherwise we would have a circular dependency.
    ENCODINGS: Final[Dict[int, str]] = {
        0x00: "ascii",
        0x20: "shift-jis-legacy",
        0x60: "euc-jp",
        0x80: "shift-jis",
        0xA0: "utf-8",
    }

    def __init__(self) -> None:
        """
        Set up an encoding object that has not seen a packet yet.
        """
        # Text encoding name of the last recognized packet, if any
        self.encoding: Optional[str] = None
        # Whether the last inspected packet used compressed node names
        self.compressed: bool = True

    def __sanitize_encoding(self, enc: str) -> str:
        """
        Map an externally visible encoding name onto the name used internally.

        Parameters:
            enc - The encoding as a string as passed from an outside caller

        Returns:
            An encoding string suitable for internal use.
        """
        return "shift-jis" if enc == "shift-jis-legacy" else enc

    def decode(self, data: bytes, skip_on_exceptions: bool=False) -> Optional[Node]:
        """
        Decode a data blob using the encoding named in its own prefix. The
        'encoding' and 'compressed' properties are updated to describe the
        packet as it is recognized.

        Parameters:
            data - Binary blob representing the data to decode
            skip_on_exceptions - When True, a BinaryEncodingException raised while
                                 decoding results in None instead of propagating

        Returns:
            Node object representing the root of the decoded tree, or None
            if we couldn't decode the object for some reason.
        """
        try:
            prefix = struct.unpack(">BBBB", data[0:4])
        except struct.error:
            # Not even enough data for the prefix
            return None
        data_magic, contents, encoding_raw, encoding_swapped = prefix

        if data_magic != BinaryEncoding.MAGIC:
            return None

        # The fourth byte must be the bitwise complement of the third
        if (encoding_raw ^ 0xFF) != encoding_swapped:
            return None

        self.compressed = contents in (
            BinaryEncoding.COMPRESSED_WITH_DATA,
            BinaryEncoding.COMPRESSED_WITHOUT_DATA,
        )
        if not self.compressed:
            if contents not in (
                BinaryEncoding.DECOMPRESSED_WITH_DATA,
                BinaryEncoding.DECOMPRESSED_WITHOUT_DATA,
            ):
                return None

        encoding = BinaryEncoding.ENCODINGS.get(encoding_raw)
        if encoding is None:
            return None
        self.encoding = encoding

        try:
            return BinaryDecoder(
                data[4:],
                self.__sanitize_encoding(encoding),
                self.compressed,
            ).get_tree()
        except BinaryEncodingException:
            if not skip_on_exceptions:
                raise
            return None

    def encode(self, tree: Node, encoding: Optional[str]=None, compressed: bool=True) -> bytes:
        """
        Encode a tree of Node objects into a binary packet.

        Parameters:
            tree - Node tree representing the data to encode
            encoding - The text encoding to use. If None, will try to use the encoding from
                       the last successful decode
            compressed - Whether node names should be written in compressed form

        Returns:
            Binary blob representing encoded data
        """
        chosen = self.encoding if encoding is None else encoding
        if chosen is None:
            raise BinaryEncodingException('Unknown encoding')

        # Reverse lookup of the prefix byte for this encoding name
        encoding_magic = next(
            (magic for magic, encstr in BinaryEncoding.ENCODINGS.items() if encstr == chosen),
            None,
        )
        if encoding_magic is None:
            raise BinaryEncodingException(f"Invalid text encoding {chosen}")

        payload = BinaryEncoder(tree, self.__sanitize_encoding(chosen), compressed).get_data()

        if compressed:
            contents = BinaryEncoding.COMPRESSED_WITH_DATA
        else:
            contents = BinaryEncoding.DECOMPRESSED_WITH_DATA

        prefix = struct.pack(
            ">BBBB",
            BinaryEncoding.MAGIC,
            contents,
            encoding_magic,
            (~encoding_magic & 0xFF),
        )
        return prefix + payload