Lots more implementation done on decompiler, including better control flow and loop detection.
This commit is contained in:
parent
b77ccdd5b9
commit
738fce36c9
@ -1,5 +1,5 @@
|
||||
import os
|
||||
from typing import Any, Dict, List, Tuple, cast
|
||||
from typing import Any, Dict, List, Tuple, Set, Union, Optional, cast
|
||||
|
||||
from .types import AP2Action, JumpAction, IfAction
|
||||
from .util import VerboseOutput
|
||||
@ -73,20 +73,129 @@ class ControlFlow:
|
||||
|
||||
|
||||
class ByteCodeChunk:
    """A run of bytecode actions treated as a single node in the control flow graph.

    Each chunk knows its own ID plus the chunks that lead into it and the chunks
    that it leads to.
    """

    def __init__(
        self,
        id: int,
        actions: List[AP2Action],
        next_chunks: List[int],
        previous_chunks: Optional[List[int]] = None,
    ) -> None:
        """Create a chunk.

        id: identifier that other chunks use to point at this one.
        actions: the actions contained in this chunk; may be empty.
        next_chunks: the chunks this chunk leads to.
        previous_chunks: the chunks that lead into this chunk. Omitted, None
            or empty all result in a fresh empty list.
        """
        self.id = id
        self.actions = actions
        self.next_chunks = next_chunks
        # A None default (rather than a literal list) keeps the signature free of a
        # shared mutable default; a falsy argument always yields a brand new list.
        self.previous_chunks = previous_chunks or []

    @property
    def offset(self) -> Optional[int]:
        """Return the offset of the first action, or None if there are no actions."""
        if self.actions:
            return self.actions[0].offset
        return None

    def __repr__(self) -> str:
        # Render every action, prefixing each of its lines so they nest under the chunk.
        entries: List[str] = []
        for action in self.actions:
            entries.extend([f" {s}" for s in str(action).split(os.linesep)])

        return (
            f"ByteCodeChunk({os.linesep}" +
            f" ID: {self.id}{os.linesep}" +
            (f" Previous Chunks: {', '.join(str(n) for n in self.previous_chunks)}{os.linesep}" if self.previous_chunks else f" Start Chunk{os.linesep}") +
            f"{os.linesep.join(entries)}{os.linesep}" +
            (f" Next Chunks: {', '.join(str(n) for n in self.next_chunks)}{os.linesep}" if self.next_chunks else f" End Chunk{os.linesep}") +
            ")"
        )
|
||||
|
||||
|
||||
# A node in the chunk graph: either a plain chunk of bytecode or a Loop that
# groups several such nodes together.
ArbitraryCodeChunk = Union[ByteCodeChunk, "Loop"]
|
||||
|
||||
|
||||
class Loop:
    """A group of chunks (and possibly nested loops) that together form one loop."""

    def __init__(self, id: int, chunks: List[ArbitraryCodeChunk]) -> None:
        # The ID is usually the chunk that other chunks point into (the header).
        self.id = id
        self.chunks = chunks

        # IDs of everything that lives inside of this loop.
        member_ids = {member.id for member in chunks}

        # Predecessors are anything outside of the loop that points into it, and
        # successors are anything outside of the loop that we point out to.
        self.previous_chunks: List[int] = [
            previd
            for member in chunks
            for previd in member.previous_chunks
            if previd not in member_ids
        ]
        self.next_chunks: List[int] = [
            nextid
            for member in chunks
            for nextid in member.next_chunks
            if nextid not in member_ids
        ]

    @property
    def offset(self) -> Optional[int]:
        """Return the offset of the member whose ID matches the loop's own ID."""
        header = next((member for member in self.chunks if member.id == self.id), None)
        if header is None:
            # We're guaranteed to have a header (the ID), so this is a problem.
            raise Exception("Logic error!")
        return header.offset

    def __repr__(self) -> str:
        # Render every member, prefixing each of its lines so they nest under the loop.
        entries: List[str] = []
        for member in self.chunks:
            for line in str(member).split(os.linesep):
                entries.append(f" {line}")

        if self.previous_chunks:
            incoming = f" Previous Chunks: {', '.join(str(n) for n in self.previous_chunks)}{os.linesep}"
        else:
            incoming = f" Start Chunk{os.linesep}"

        if self.next_chunks:
            outgoing = f" Next Chunks: {', '.join(str(n) for n in self.next_chunks)}{os.linesep}"
        else:
            outgoing = f" End Chunk{os.linesep}"

        return "".join([
            f"Loop({os.linesep}",
            f" ID: {self.id}{os.linesep}",
            incoming,
            f"{os.linesep.join(entries)}{os.linesep}",
            outgoing,
            ")",
        ])
|
||||
|
||||
|
||||
class BitVector:
    """A fixed-length collection of boolean flags addressed by integer index.

    All mutating methods modify the vector in place and return it, so calls
    can be chained.
    """

    def __init__(self, length: int, init: bool = False) -> None:
        """Create a vector of `length` bits, each initially set to `init`."""
        self.__bits: Dict[int, bool] = {i: init for i in range(length)}

    def clone(self) -> "BitVector":
        """Return an independent copy of this vector."""
        new = BitVector(len(self.__bits))
        new.__bits = dict(self.__bits)
        return new

    def setAllBitsTo(self, val: bool) -> "BitVector":
        """Set every bit to `val` and return this vector."""
        self.__bits = {i: val for i in self.__bits}
        return self

    def setBit(self, bit: int) -> "BitVector":
        """Set the bit at index `bit` and return this vector."""
        self.__bits[bit] = True
        return self

    def clearBit(self, bit: int) -> "BitVector":
        """Clear the bit at index `bit` and return this vector."""
        self.__bits[bit] = False
        return self

    def orVector(self, other: "BitVector") -> "BitVector":
        """Bitwise-or `other` into this vector in place and return this vector.

        Raises an Exception if the two vectors differ in size.
        """
        if len(self.__bits) != len(other.__bits):
            raise Exception("Cannot or different-sized bitvectors!")
        self.__bits = {i: (self.__bits[i] or other.__bits[i]) for i in self.__bits}
        return self

    def andVector(self, other: "BitVector") -> "BitVector":
        """Bitwise-and `other` into this vector in place and return this vector.

        Raises an Exception if the two vectors differ in size.
        """
        if len(self.__bits) != len(other.__bits):
            raise Exception("Cannot and different-sized bitvectors!")
        self.__bits = {i: (self.__bits[i] and other.__bits[i]) for i in self.__bits}
        return self

    def __eq__(self, other: object) -> bool:
        """Return True when both vectors hold identical bits.

        Returns NotImplemented for non-BitVector operands, and raises an
        Exception when the two vectors differ in size.
        """
        if not isinstance(other, BitVector):
            return NotImplemented
        if len(self.__bits) != len(other.__bits):
            raise Exception("Cannot compare different-sized bitvectors!")

        return all(self.__bits[i] == other.__bits[i] for i in self.__bits)

    def __ne__(self, other: object) -> bool:
        """Return the inverse of `__eq__`, deferring on foreign operand types."""
        result = self.__eq__(other)
        if result is NotImplemented:
            # Negating NotImplemented would wrongly claim the operands are not
            # different (and is a TypeError on newer Pythons), so hand the
            # decision back to the interpreter instead.
            return NotImplemented
        return not result

    @property
    def bitsSet(self) -> Set[int]:
        """Return the indexes of every bit that is currently set."""
        return {i for i in self.__bits if self.__bits[i]}
|
||||
|
||||
|
||||
class ByteCodeDecompiler(VerboseOutput):
|
||||
@ -250,6 +359,7 @@ class ByteCodeDecompiler(VerboseOutput):
|
||||
|
||||
# Finally, return chunks of contiguous execution.
|
||||
chunks: List[ByteCodeChunk] = []
|
||||
chunkid: int = 0
|
||||
for start, flow in flows.items():
|
||||
if start == end:
|
||||
# We don't want to render out the end of the graph, it was only there to make
|
||||
@ -258,25 +368,188 @@ class ByteCodeDecompiler(VerboseOutput):
|
||||
|
||||
if len(flow.next_flow) == 1 and flow.next_flow[0] == end:
|
||||
# This flow is a termination state.
|
||||
chunks.append(ByteCodeChunk(self.bytecode.actions[flow.beginning:flow.end], []))
|
||||
chunks.append(ByteCodeChunk(chunkid, self.bytecode.actions[flow.beginning:flow.end], []))
|
||||
chunkid += 1
|
||||
else:
|
||||
next_chunks: List[int] = []
|
||||
for ano in flow.next_flow:
|
||||
if ano == end:
|
||||
raise Exception("Logic error!")
|
||||
next_chunks.append(self.bytecode.actions[ano].offset)
|
||||
chunks.append(ByteCodeChunk(self.bytecode.actions[flow.beginning:flow.end], next_chunks))
|
||||
chunks.append(ByteCodeChunk(chunkid, self.bytecode.actions[flow.beginning:flow.end], next_chunks))
|
||||
chunkid += 1
|
||||
|
||||
return sorted(chunks, key=lambda c: c.offset)
|
||||
# Calculate who points to us as well, for posterity.
|
||||
entries: Dict[int, List[int]] = {}
|
||||
offset_to_id: Dict[int, int] = {}
|
||||
for chunk in chunks:
|
||||
offset_to_id[chunk.offset] = chunk.id
|
||||
for next_chunk in chunk.next_chunks:
|
||||
entries[next_chunk] = entries.get(next_chunk, []) + [chunk.offset]
|
||||
|
||||
def decompile(self, verbose: bool = False) -> str:
|
||||
with self.debugging(verbose):
|
||||
return self.__decompile()
|
||||
for chunk in chunks:
|
||||
chunk.previous_chunks = entries.get(chunk.offset, [])
|
||||
|
||||
# Now, convert the offsets to chunk ID pointers.
|
||||
end_previous_chunks: List[int] = []
|
||||
for chunk in chunks:
|
||||
if chunk.next_chunks:
|
||||
# Normal chunk.
|
||||
chunk.next_chunks = [offset_to_id[c] for c in chunk.next_chunks]
|
||||
else:
|
||||
# Point this chunk at the end of bytecode sentinel.
|
||||
chunk.next_chunks = [chunkid]
|
||||
end_previous_chunks.append(chunk.id)
|
||||
chunk.previous_chunks = [offset_to_id[c] for c in chunk.previous_chunks]
|
||||
|
||||
chunks.append(ByteCodeChunk(chunkid, [], [], previous_chunks=end_previous_chunks))
|
||||
return sorted(chunks, key=lambda c: c.id)
|
||||
|
||||
def __get_entry_block(self, chunks: List[ByteCodeChunk]) -> int:
    """Return the ID of the single chunk that has no previous chunks.

    Raises an Exception if there is no such chunk, or more than one.
    """
    entry_ids = [chunk.id for chunk in chunks if not chunk.previous_chunks]

    # We always expect exactly one entrypoint. More than one should never happen;
    # if we run into it we might need to do dead code analysis and discarding.
    # Zero should never happen either, as we always have at least one entrypoint.
    if len(entry_ids) != 1:
        raise Exception("Logic error!")

    return entry_ids[0]
|
||||
|
||||
def __compute_dominators(self, chunks: List[ByteCodeChunk]) -> Dict[int, Set[int]]:
    """Return, for every chunk ID, the set of chunk IDs that dominate it.

    Starts from "everything dominates everything" (except the entry chunk,
    which is only dominated by itself) and keeps narrowing each chunk's set
    by its predecessors' sets until a full pass changes nothing.
    """
    # Find the start of the graph (the node with no previous entries).
    entry_id = self.__get_entry_block(chunks)

    total = len(chunks)
    dom_sets: Dict[int, BitVector] = {chunk.id: BitVector(total, init=True) for chunk in chunks}
    dom_sets[entry_id].setAllBitsTo(False).setBit(entry_id)

    # Iterate until we reach a fixed point.
    while True:
        dirty = False

        for chunk in chunks:
            if chunk.id == entry_id:
                continue

            current = dom_sets[chunk.id]
            for pred_id in chunk.previous_chunks:
                # A chunk is dominated by itself plus whatever dominates
                # all of its predecessors.
                before = current.clone()
                current.andVector(dom_sets[pred_id]).setBit(chunk.id)
                if current != before:
                    dirty = True

        if not dirty:
            break

    return {chunk.id: dom_sets[chunk.id].bitsSet for chunk in chunks}
|
||||
|
||||
def __separate_loops(self, chunks: List[ByteCodeChunk], dominators: Dict[int, Set[int]]) -> List[Union[ByteCodeChunk, Loop]]:
    """Collapse every loop found in the chunk graph into a Loop object.

    chunks: the chunks making up the graph.
    dominators: for each chunk ID, the set of chunk IDs that dominate it.

    Returns the remaining top-level chunks and loops. Raises an Exception if
    two loops share a header, if no convertible loop can be found, or if a
    surviving node points into the non-header part of a converted loop.
    """
    # Find the start of the graph (the node with no previous entries).
    entry_id = self.__get_entry_block(chunks)
    nodes_by_id: Dict[int, Union[ByteCodeChunk, Loop]] = {chunk.id: chunk for chunk in chunks}

    # Go through and gather up all loops in the chunks, keyed by header.
    loops: Dict[int, Set[int]] = {}
    for chunk in chunks:
        if chunk.id == entry_id:
            continue

        for successor in chunk.next_chunks:
            # Only an edge to a chunk that dominates us means we found a loop.
            if successor not in dominators[chunk.id]:
                continue

            # Calculate the blocks that are in this loop.
            header = successor
            tail = chunk.id
            members = {header}

            # Unless this is a loop of one block, walk backwards from the
            # tail to find all blocks in this loop.
            if header != tail:
                members.add(tail)
                pending = [tail]

                while pending:
                    current = pending.pop()
                    for predecessor in nodes_by_id[current].previous_chunks:
                        if predecessor not in members:
                            members.add(predecessor)
                            pending.append(predecessor)

            self.vprint(f"Found loop with header {header} and blocks {', '.join(str(b) for b in members)}.")

            # We found a loop!
            if header in loops:
                raise Exception("Logic error!")
            loops[header] = members

    # Now, we need to reduce our list of chunks down to non-loops only. We do this
    # by repeatedly picking a loop that has no inner loops and converting it. Once
    # converted, its non-header chunks are removed from our list and from every
    # other loop, leaving the header ID to stand in for the whole loop.
    while loops:
        converted_header: Optional[int] = None
        absorbed: Set[int] = set()

        for header, members in loops.items():
            # If any block in this loop is the header of another loop, we
            # shouldn't convert this loop until we handle the inner loop.
            has_inner_loop = any(
                member in loops and loops[member] is not members
                for member in members
            )
            if has_inner_loop:
                continue

            # This loop does not contain any loops of its own. It is safe to convert.
            self.vprint(f"Converting loop with header {header} and blocks {', '.join(str(b) for b in members)}.")
            nodes_by_id[header] = Loop(header, [nodes_by_id[i] for i in members])

            # These blocks are now part of the loop, so we need to remove them
            # from the IDed chunks as well as from existing loops.
            absorbed = {member for member in members if member != header}
            converted_header = header
            break

        if converted_header is None:
            # We must find at LEAST one loop that has no inner loops of its own.
            raise Exception("Logic error!")

        # Remove this loop from the processing list.
        del loops[converted_header]

        # Go through and remove the rest of the chunks from the rest of the loops.
        loops = {
            header: {member for member in members if member not in absorbed}
            for (header, members) in loops.items()
        }

        # Also remove the rest of the chunks from our IDed chunks as they are part of this loop now.
        for member in absorbed:
            del nodes_by_id[member]

        # Verify that we don't have any existing chunks that point at the non-header portion of the loop.
        for node in nodes_by_id.values():
            for successor in node.next_chunks:
                if successor in absorbed:
                    # Woah, we point at a chunk inside this loop that isn't the header!
                    raise Exception("Logic error!")

    return list(nodes_by_id.values())
|
||||
|
||||
def __decompile(self) -> str:
    """Run the decompilation passes implemented so far.

    Builds the control flow graph, computes dominators and separates loops,
    printing the intermediate results through `vprint`. The return value is
    currently always the placeholder string "TODO".
    """
    # First, we need to construct a control flow graph.
    flow_graph = self.__graph_control_flow()
    self.vprint(flow_graph)

    # Now, compute dominators so we can locate back-refs.
    dominator_map = self.__compute_dominators(flow_graph)

    # Now, separate chunks out into chunks and loops.
    separated = self.__separate_loops(flow_graph, dominator_map)
    self.vprint(separated)

    return "TODO"
|
||||
|
||||
def decompile(self, verbose: bool = False) -> str:
    """Decompile the bytecode and return the result as a string.

    The work is done by `__decompile`, run inside the `self.debugging(verbose)`
    context (presumably this toggles verbose output for the duration of the
    call; confirm against VerboseOutput).
    """
    with self.debugging(verbose):
        return self.__decompile()
|
||||
|
Loading…
x
Reference in New Issue
Block a user