1
0
mirror of synced 2025-02-17 11:18:33 +01:00

Lots more implementation done on decompiler, including better control flow and loop detection.

This commit is contained in:
Jennifer Taylor 2021-04-24 18:00:13 +00:00
parent b77ccdd5b9
commit 738fce36c9

View File

@ -1,5 +1,5 @@
import os
from typing import Any, Dict, List, Tuple, cast
from typing import Any, Dict, List, Tuple, Set, Union, Optional, cast
from .types import AP2Action, JumpAction, IfAction
from .util import VerboseOutput
@ -73,20 +73,129 @@ class ControlFlow:
class ByteCodeChunk:
    """A run of bytecode actions that forms a single node in the control flow graph.

    Attributes are plain public fields:
      id              - integer identifier of this chunk.
      actions         - the actions contained in this chunk (may be empty).
      next_chunks     - references to the chunks that execution can continue into.
      previous_chunks - references to the chunks that can continue into this one.
    """

    def __init__(self, id: int, actions: List[AP2Action], next_chunks: List[int], previous_chunks: Optional[List[int]] = None) -> None:
        """Create a chunk.

        previous_chunks defaults to None instead of a shared mutable list literal;
        a missing or empty value always results in a fresh empty list on the instance.
        """
        self.id = id
        self.actions = actions
        self.next_chunks = next_chunks
        self.previous_chunks = previous_chunks or []

    @property
    def offset(self) -> Optional[int]:
        """Return the offset of the first action, or None when the chunk has no actions."""
        if self.actions:
            return self.actions[0].offset
        return None

    def __repr__(self) -> str:
        """Render the chunk, its actions and its neighbours as a multi-line debug string."""
        entries: List[str] = []
        for action in self.actions:
            # Indent every line of each action's own rendering by one level.
            entries.extend([f" {s}" for s in str(action).split(os.linesep)])

        return (
            f"ByteCodeChunk({os.linesep}" +
            f" ID: {self.id}{os.linesep}" +
            (f" Previous Chunks: {', '.join(str(n) for n in self.previous_chunks)}{os.linesep}" if self.previous_chunks else f" Start Chunk{os.linesep}") +
            f"{os.linesep.join(entries)}{os.linesep}" +
            (f" Next Chunks: {', '.join(str(n) for n in self.next_chunks)}{os.linesep}" if self.next_chunks else f" End Chunk{os.linesep}") +
            ")"
        )
# Anything that can appear as a node once loops have been collapsed.
ArbitraryCodeChunk = Union[ByteCodeChunk, "Loop"]


class Loop:
    """A group of chunks (or nested loops) collapsed into a single graph node.

    The loop exposes the same id / previous_chunks / next_chunks / offset surface
    as the chunks it contains, so it can stand in for them in the graph.
    """

    def __init__(self, id: int, chunks: List[ArbitraryCodeChunk]) -> None:
        # The ID is usually the chunk that other chunks point into.
        self.id = id
        self.chunks = chunks

        # Edges that stay inside the loop are not interesting to the outside
        # world; only references to IDs that are not members get recorded.
        member_ids = {member.id for member in chunks}

        # Successors: everything a member points at that lives outside the loop.
        self.next_chunks: List[int] = [
            target
            for member in chunks
            for target in member.next_chunks
            if target not in member_ids
        ]
        # Predecessors: everything outside the loop that points at a member.
        self.previous_chunks: List[int] = [
            source
            for member in chunks
            for source in member.previous_chunks
            if source not in member_ids
        ]

    @property
    def offset(self) -> Optional[int]:
        """Return the offset of the member whose ID equals this loop's ID."""
        for member in self.chunks:
            if member.id != self.id:
                continue
            return member.offset

        # We're guaranteed to have a header (the ID), so this is a problem.
        raise Exception("Logic error!")

    def __repr__(self) -> str:
        """Render the loop and all of its members as a multi-line debug string."""
        body: List[str] = []
        for member in self.chunks:
            for line in str(member).split(os.linesep):
                body.append(f" {line}")

        if self.previous_chunks:
            incoming = f" Previous Chunks: {', '.join(str(n) for n in self.previous_chunks)}"
        else:
            incoming = " Start Chunk"

        if self.next_chunks:
            outgoing = f" Next Chunks: {', '.join(str(n) for n in self.next_chunks)}"
        else:
            outgoing = " End Chunk"

        return os.linesep.join([
            "Loop(",
            f" ID: {self.id}",
            incoming,
            os.linesep.join(body),
            outgoing,
            ")",
        ])
class BitVector:
    """A fixed-length collection of boolean flags addressed by integer index.

    Bits are created for indexes 0..length-1. All mutating methods change the
    vector in place and return self so that calls can be chained.
    """

    def __init__(self, length: int, init: bool = False) -> None:
        """Create a vector of `length` bits, every bit starting out as `init`."""
        self.__bits: Dict[int, bool] = {i: init for i in range(length)}

    def clone(self) -> "BitVector":
        """Return an independent copy of this vector."""
        new = BitVector(0)
        new.__bits = dict(self.__bits)
        return new

    def setAllBitsTo(self, val: bool) -> "BitVector":
        """Set every bit to `val`."""
        self.__bits = {i: val for i in self.__bits}
        return self

    def setBit(self, bit: int) -> "BitVector":
        """Set the bit at index `bit` to True."""
        self.__bits[bit] = True
        return self

    def clearBit(self, bit: int) -> "BitVector":
        """Set the bit at index `bit` to False."""
        self.__bits[bit] = False
        return self

    def orVector(self, other: "BitVector") -> "BitVector":
        """Bitwise-or `other` into this vector. Raises if the lengths differ."""
        if len(self.__bits) != len(other.__bits):
            raise Exception("Cannot or different-sized bitvectors!")
        self.__bits = {i: (self.__bits[i] or other.__bits[i]) for i in self.__bits}
        return self

    def andVector(self, other: "BitVector") -> "BitVector":
        """Bitwise-and `other` into this vector. Raises if the lengths differ."""
        if len(self.__bits) != len(other.__bits):
            raise Exception("Cannot and different-sized bitvectors!")
        self.__bits = {i: (self.__bits[i] and other.__bits[i]) for i in self.__bits}
        return self

    def __eq__(self, other: object) -> bool:
        """Compare bit-for-bit with another BitVector.

        Returns NotImplemented for foreign types and raises when both operands
        are BitVectors of different lengths.
        """
        if not isinstance(other, BitVector):
            return NotImplemented
        if len(self.__bits) != len(other.__bits):
            raise Exception("Cannot compare different-sized bitvectors!")
        return all(self.__bits[i] == other.__bits[i] for i in self.__bits)

    def __ne__(self, other: object) -> bool:
        """Inverse of __eq__, passing NotImplemented through untouched.

        Negating NotImplemented directly would report a BitVector as "not
        different" from an unrelated object (and newer interpreters refuse to
        evaluate its truth value at all), so it must be propagated instead.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return NotImplemented
        return not result

    @property
    def bitsSet(self) -> Set[int]:
        """Return the set of indexes whose bit is currently True."""
        return {i for i in self.__bits if self.__bits[i]}
class ByteCodeDecompiler(VerboseOutput):
@ -250,6 +359,7 @@ class ByteCodeDecompiler(VerboseOutput):
# Finally, return chunks of contiguous execution.
chunks: List[ByteCodeChunk] = []
chunkid: int = 0
for start, flow in flows.items():
if start == end:
# We don't want to render out the end of the graph, it was only there to make
@ -258,25 +368,188 @@ class ByteCodeDecompiler(VerboseOutput):
if len(flow.next_flow) == 1 and flow.next_flow[0] == end:
# This flow is a termination state.
chunks.append(ByteCodeChunk(self.bytecode.actions[flow.beginning:flow.end], []))
chunks.append(ByteCodeChunk(chunkid, self.bytecode.actions[flow.beginning:flow.end], []))
chunkid += 1
else:
next_chunks: List[int] = []
for ano in flow.next_flow:
if ano == end:
raise Exception("Logic error!")
next_chunks.append(self.bytecode.actions[ano].offset)
chunks.append(ByteCodeChunk(self.bytecode.actions[flow.beginning:flow.end], next_chunks))
chunks.append(ByteCodeChunk(chunkid, self.bytecode.actions[flow.beginning:flow.end], next_chunks))
chunkid += 1
return sorted(chunks, key=lambda c: c.offset)
# Calculate who points to us as well, for posterity.
entries: Dict[int, List[int]] = {}
offset_to_id: Dict[int, int] = {}
for chunk in chunks:
offset_to_id[chunk.offset] = chunk.id
for next_chunk in chunk.next_chunks:
entries[next_chunk] = entries.get(next_chunk, []) + [chunk.offset]
def decompile(self, verbose: bool = False) -> str:
with self.debugging(verbose):
return self.__decompile()
for chunk in chunks:
chunk.previous_chunks = entries.get(chunk.offset, [])
# Now, convert the offsets to chunk ID pointers.
end_previous_chunks: List[int] = []
for chunk in chunks:
if chunk.next_chunks:
# Normal chunk.
chunk.next_chunks = [offset_to_id[c] for c in chunk.next_chunks]
else:
# Point this chunk at the end of bytecode sentinel.
chunk.next_chunks = [chunkid]
end_previous_chunks.append(chunk.id)
chunk.previous_chunks = [offset_to_id[c] for c in chunk.previous_chunks]
chunks.append(ByteCodeChunk(chunkid, [], [], previous_chunks=end_previous_chunks))
return sorted(chunks, key=lambda c: c.id)
def __get_entry_block(self, chunks: List[ByteCodeChunk]) -> int:
    """Return the ID of the single chunk that has no predecessors.

    Raises an Exception when a second predecessor-less chunk is encountered or
    when no such chunk exists at all.
    """
    entry_id: int = -1
    for candidate in chunks:
        if candidate.previous_chunks:
            # Something points at this chunk, so it cannot be the entrypoint.
            continue

        if entry_id != -1:
            # This should never happen, we have one entrypoint. If we run into
            # this we might need to do dead code analysis and discarding.
            raise Exception("Logic error!")
        entry_id = candidate.id

    if entry_id == -1:
        # We should never get to this as we always have at least one entrypoint.
        raise Exception("Logic error!")

    return entry_id
def __compute_dominators(self, chunks: List[ByteCodeChunk]) -> Dict[int, Set[int]]:
    """Map every chunk ID to the set of chunk IDs that dominate it.

    The entry chunk is dominated only by itself. Every other chunk starts out
    dominated by everything and is narrowed down by intersecting with each of
    its predecessors until a full pass produces no change.
    """
    # Find the start of the graph (the node with no previous entries).
    entry_id = self.__get_entry_block(chunks)

    # Bits are indexed by chunk ID, one bit per chunk in the graph.
    total = len(chunks)
    dom_bits: Dict[int, BitVector] = {}
    for chunk in chunks:
        dom_bits[chunk.id] = BitVector(total, init=True)
    dom_bits[entry_id].setAllBitsTo(False).setBit(entry_id)

    # Iterate until we reach a fixed point.
    stable = False
    while not stable:
        stable = True

        for chunk in chunks:
            if chunk.id == entry_id:
                continue

            current = dom_bits[chunk.id]
            for previd in chunk.previous_chunks:
                before = current.clone()
                # A chunk is dominated by itself plus whatever dominates
                # all of its predecessors.
                current.andVector(dom_bits[previd]).setBit(chunk.id)
                if current != before:
                    stable = False

    return {chunk.id: dom_bits[chunk.id].bitsSet for chunk in chunks}
def __separate_loops(self, chunks: List[ByteCodeChunk], dominators: Dict[int, Set[int]]) -> List[Union[ByteCodeChunk, Loop]]:
    """Collapse every loop found in the graph into a Loop node.

    A loop is detected whenever a chunk points at a chunk that dominates it.
    Loops are converted innermost-first; the returned list holds the remaining
    top-level chunks and loops.

    Raises an Exception when two back-references share a header, when no
    convertible loop can be found, or when a remaining node points into the
    non-header part of a converted loop.
    """
    # Find the start of the graph (the node with no previous entries).
    entry_id = self.__get_entry_block(chunks)
    chunks_by_id: Dict[int, Union[ByteCodeChunk, Loop]] = {chunk.id: chunk for chunk in chunks}

    # Go through and gather up all loops in the chunks.
    loops: Dict[int, Set[int]] = {}
    for tail_chunk in chunks:
        if tail_chunk.id == entry_id:
            continue

        for header in tail_chunk.next_chunks:
            # Only a successor that dominates us marks a loop.
            if header not in dominators[tail_chunk.id]:
                continue

            # Calculate the blocks that are in this loop.
            tail = tail_chunk.id
            members = {header}

            # Unless the loop is a single block pointing at itself, walk
            # backwards from the tail to collect every block in the loop.
            if header != tail:
                members.add(tail)
                pending = [tail]

                while pending:
                    current = pending.pop()
                    for predecessor in chunks_by_id[current].previous_chunks:
                        if predecessor in members:
                            continue
                        members.add(predecessor)
                        pending.append(predecessor)

            self.vprint(f"Found loop with header {header} and blocks {', '.join(str(b) for b in members)}.")

            # We found a loop!
            if header in loops:
                raise Exception("Logic error!")
            loops[header] = members

    # Now, reduce our list of chunks down to non-loops only. Repeatedly pick a
    # loop that contains no other loop header, convert it, drop its non-header
    # blocks from the graph and from every other pending loop, and carry on
    # until no loops are left.
    while loops:
        convert_header: Optional[int] = None
        absorbed: Set[int] = set()

        for header, members in loops.items():
            # A member that heads a different loop means an inner loop is
            # still pending, so this one has to wait.
            contains_inner = any(
                (member in loops and loops[member] is not members)
                for member in members
            )
            if contains_inner:
                continue

            # This loop does not contain any loops of its own. It is safe to
            # convert.
            self.vprint(f"Converting loop with header {header} and blocks {', '.join(str(b) for b in members)}.")
            chunks_by_id[header] = Loop(header, [chunks_by_id[i] for i in members])

            # These blocks are now part of the loop, so they need to vanish
            # from the IDed chunks as well as from existing loops.
            absorbed = {member for member in members if member != header}
            convert_header = header
            break

        if convert_header is None:
            # We must find at LEAST one loop that has no inner loops of its own.
            raise Exception("Logic error!")

        # Remove this loop from the processing list.
        del loops[convert_header]

        # Go through and remove the absorbed chunks from the rest of the loops.
        loops = {
            header: {member for member in members if member not in absorbed}
            for (header, members) in loops.items()
        }

        # Also remove the absorbed chunks from our IDed chunks as they are part of this loop now.
        for member in absorbed:
            del chunks_by_id[member]

        # Verify that nothing left in the graph points at the non-header portion of the loop.
        for chunk_or_loop in chunks_by_id.values():
            for nextid in chunk_or_loop.next_chunks:
                if nextid in absorbed:
                    # Woah, we point at a chunk inside this loop that isn't the header!
                    raise Exception("Logic error!")

    return list(chunks_by_id.values())
def __decompile(self) -> str:
    """Run the decompilation passes implemented so far and return the result.

    Currently builds the control flow graph, computes dominators and collapses
    loops, printing intermediate results through vprint. The returned text is
    still the placeholder "TODO".
    """
    # Step one: the control flow graph.
    flow_graph = self.__graph_control_flow()
    self.vprint(flow_graph)

    # Step two: dominators, which let us locate back-refs.
    dominator_sets = self.__compute_dominators(flow_graph)

    # Step three: split the graph into plain chunks and loops.
    reduced_graph = self.__separate_loops(flow_graph, dominator_sets)
    self.vprint(reduced_graph)

    return "TODO"
def decompile(self, verbose: bool = False) -> str:
    """Decompile the bytecode and return the result as text.

    The work happens inside the self.debugging(verbose) context, so `verbose`
    is handed to that context manager for the duration of the run.
    """
    with self.debugging(verbose):
        decompiled: str = self.__decompile()
        return decompiled