2023-05-04 11:03:02 +02:00
|
|
|
#!/usr/bin/env python3
|
2022-10-04 16:09:23 +02:00
|
|
|
|
|
|
|
import argparse
|
2022-10-18 16:27:02 +02:00
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
import sys
|
2022-10-04 16:09:23 +02:00
|
|
|
from collections import OrderedDict
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from pathlib import Path
|
2022-10-18 16:27:02 +02:00
|
|
|
from typing import Dict, List, Optional, Tuple
|
2022-10-04 16:09:23 +02:00
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
from sty import fg
|
2022-10-04 16:09:23 +02:00
|
|
|
|
|
|
|
# Filesystem layout, expressed relative to the directory containing this script.
script_dir = Path(os.path.realpath(__file__)).parent
root_dir = script_dir.joinpath("..")

# Directory walked by get_all_unmatched_functions() for ".s" files.
asm_dir = root_dir.joinpath("ver", "current", "asm", "nonmatchings")

# Build outputs: the ELF is fed to objdump, the linker map is read by parse_map().
build_dir = root_dir.joinpath("ver", "current", "build")
elf_path = build_dir.joinpath("papermario.elf")
map_file_path = build_dir.joinpath("papermario.map")

# ROM image loaded in full by read_rom().
rom_path = root_dir.joinpath("ver", "current", "baserom.z64")

# Name of the objdump executable invoked through subprocess.
OBJDUMP = "mips-linux-gnu-objdump"
|
2022-10-04 16:09:23 +02:00
|
|
|
|
|
|
|
@dataclass
class Symbol:
    """One symbol entry, as recorded by parse_map() while reading the linker map."""

    name: str  # symbol name (last whitespace-separated token of its map line)
    rom_start: int  # RAM address minus the enclosing segment's RAM-to-ROM offset
    ram: int  # address parsed from the map line
    current_file: Path  # most recent path-like token seen in the map before this symbol
    prev_sym: str  # name of the symbol recorded just before this one ("" for the first)
    is_decompiled: bool  # True when the name is absent from the unmatched-function set
    rom_end: Optional[int] = None  # end offset in the ROM; None until it has been worked out

    def size(self):
        """Return rom_end - rom_start; rom_end must already be known."""
        end = self.rom_end
        assert end is not None
        return end - self.rom_start
|
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
|
2022-10-04 16:09:23 +02:00
|
|
|
@dataclass
class Bytes:
    """Raw and normalized byte data for a single symbol."""

    # Position marker; get_symbol_bytes() always stores 0 here.
    offset: int
    # Comparison string: one character per 4-byte word of the symbol.
    normalized: str
    # The symbol's untrimmed slice of the ROM image.
    bytes: bytes
|
2022-10-04 16:09:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
def read_rom() -> bytes:
    """Load the whole file at rom_path into memory and return it as bytes."""
    return rom_path.read_bytes()
|
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
|
2022-10-04 16:09:23 +02:00
|
|
|
def get_all_unmatched_functions():
    """Collect the names of all ".s" files found anywhere beneath asm_dir.

    Returns:
        A set holding each such file name with its two-character ".s"
        suffix removed.
    """
    return {
        filename[:-2]
        for _root, _dirs, filenames in os.walk(asm_dir)
        for filename in filenames
        if filename.endswith(".s")
    }
|
|
|
|
|
|
|
|
|
2022-10-25 12:04:54 +02:00
|
|
|
def get_func_sizes() -> Dict[str, int]:
    """Read function sizes from the built ELF by parsing ``objdump -x`` output.

    Only output lines containing " F " are considered. Each such line is
    split on whitespace; the fifth field is parsed as a hexadecimal size and
    the last field is taken as the symbol name.
    (Assumes rows shaped like ``addr flags F section size [other] name`` --
    confirm against the objdump version in use.)

    Returns:
        Mapping of function name to size.

    Exits the process with status 1 (after printing an error) when objdump
    cannot be started, exits with a non-zero status, or produces output that
    cannot be decoded.
    """
    try:
        # check=True makes a failed objdump run (e.g. missing ELF) raise
        # instead of silently yielding empty output.
        result = subprocess.run(
            [OBJDUMP, "-x", elf_path],
            stdout=subprocess.PIPE,
            check=True,
        )
        nm_lines = result.stdout.decode().split("\n")
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        print(f"Error: Could not run objdump on {elf_path} - make sure that the project is built")
        sys.exit(1)

    sizes: Dict[str, int] = {}

    for line in nm_lines:
        if " F " not in line:
            continue

        components = line.split()
        # Need at least: address, binding flag, "F", section, size, name.
        if len(components) < 6:
            continue

        # The name is always the final field, even when objdump prints an
        # extra visibility/"other" column before it.
        sizes[components[-1]] = int(components[4], 16)

    return sizes
|
|
|
|
|
2022-10-04 16:09:23 +02:00
|
|
|
def get_symbol_bytes(func: str) -> Optional[Bytes]:
    """Build the Bytes record for the symbol called *func*.

    The symbol's ROM slice is taken from the module-level rom_bytes. For the
    normalized form, trailing zero bytes are stripped, then the first byte of
    every 4-byte word is kept and shifted right by two, leaving its upper six
    bits (presumably the MIPS opcode field of a big-endian instruction --
    verify). Those values are turned into a string, one character per word.

    Returns:
        Bytes(0, normalized, untrimmed_slice), or None when *func* is not in
        syms or its rom_end is unknown.
    """
    sym = syms.get(func)
    if sym is None or sym.rom_end is None:
        return None

    raw = rom_bytes[sym.rom_start : sym.rom_end]

    # trim nops
    trimmed = raw.rstrip(b"\x00")

    # Every value is below 64 after the shift, so decoding cannot fail.
    normalized = bytes(first_byte >> 2 for first_byte in trimmed[::4]).decode("utf-8")

    return Bytes(0, normalized, raw)
|
2022-10-04 16:09:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
def parse_map() -> OrderedDict[str, Symbol]:
    """Scan the linker map at map_file_path and collect the ".text" symbols.

    The file is read line by line while tracking:
      * the current section, updated whenever a line contains one of
        "(.text*)", "(.data*)", "(.rodata*)" or "(.bss*)";
      * the RAM-to-ROM offset, taken from "load address" lines (cleared for
        "noload" segments and whenever an entry line ends in a "0x" token);
      * the current object file, i.e. the last entry line whose final token
        contains a "/".
    Addresses are read from fixed character columns of each line (assumes
    the column layout of the map file this was written against -- confirm).

    Every remaining entry line seen while inside "(.text*)" becomes a Symbol.
    Its rom_end comes from func_sizes when the name is listed there;
    otherwise it is filled in afterwards with the rom_start of the symbol
    recorded next.

    Returns:
        Symbols keyed by name, in the order they appear in the map.
    """
    tracked_sections = ("(.text*)", "(.data*)", "(.rodata*)", "(.bss*)")
    section_pattern = re.compile(r"\(\..*\)")

    symbols: OrderedDict[str, Symbol] = OrderedDict()
    ram_to_rom = None
    object_file = "<no file>"
    last_symbol = ""
    previous_line = ""
    section = ""

    with open(map_file_path) as map_file:
        for line in map_file:
            found = section_pattern.search(line)
            if found and found.group(0) in tracked_sections:
                section = found.group(0)

            if "load address" in line:
                if "noload" in line or "noload" in previous_line:
                    # Nothing in a noload segment lives in the ROM.
                    ram_to_rom = None
                else:
                    segment_ram = int(line[16:34], 0)
                    segment_rom = int(line[59:77], 0)
                    ram_to_rom = segment_ram - segment_rom
                continue
            # Only lines without "load address" are remembered for the
            # "noload" look-behind above.
            previous_line = line

            if ram_to_rom is None:
                continue
            if "=" in line or "*fill*" in line or " 0x" not in line:
                continue

            ram = int(line[16:34], 0)
            rom = ram - ram_to_rom
            last_token = line.split()[-1]

            if "0x" in last_token:
                ram_to_rom = None
                continue
            if "/" in last_token:
                object_file = last_token
                continue
            if section != "(.text*)":
                continue

            entry = Symbol(
                name=last_token,
                rom_start=rom,
                ram=ram,
                current_file=Path(object_file),
                prev_sym=last_symbol,
                is_decompiled=last_token not in unmatched_functions,
            )
            if last_token in func_sizes:
                entry.rom_end = rom + func_sizes[last_token]
            symbols[last_token] = entry
            last_symbol = last_token

    # Calc end offsets
    for entry in symbols.values():
        before = entry.prev_sym
        if before and not symbols[before].rom_end:
            symbols[before].rom_end = entry.rom_start

    return symbols
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Match:
    """A window that occurs in both the query and a target symbol."""

    query_offset: int  # index of the window in the query's window list
    target_offset: int  # index of the same window in the target's window list
    length: int  # get_pair_matches() always stores 1

    def __str__(self):
        """Render the three fields separated by single spaces."""
        return "{} {} {}".format(self.query_offset, self.target_offset, self.length)
|
|
|
|
|
2022-10-11 23:13:08 +02:00
|
|
|
|
|
|
|
@dataclass
class Result:
    """A contiguous run of matching windows between a query and a target."""

    query: str  # name of the query symbol
    target: str  # name of the symbol it was compared against
    query_start: int  # offset of the run within the query
    target_start: int  # offset of the run within the target
    length: int  # length of the run, as computed by group_matches()

    @property
    def query_end(self):
        """Offset in the query where the run stops: query_start + length."""
        start, span = self.query_start, self.length
        return start + span

    @property
    def target_end(self):
        """Offset in the target where the run stops: target_start + length."""
        start, span = self.target_start, self.length
        return start + span
|
|
|
|
|
2022-10-11 23:13:08 +02:00
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
def get_pair_matches(query_hashes: list[str], sym_hashes: list[str]) -> list[Match]:
    """Pair up every distinct window that occurs in both window lists.

    Args:
        query_hashes: windows of the query, in order.
        sym_hashes: windows of the target symbol, in order.

    Returns:
        One Match of length 1 per distinct shared window, pointing at the
        first index of that window in each list. Matches are returned in
        ascending query-index order.
    """
    # Index the target once so each lookup is O(1); calling list.index() per
    # shared window rescans both lists every time.
    first_in_target: dict[str, int] = {}
    for idx, window in enumerate(sym_hashes):
        first_in_target.setdefault(window, idx)

    ret = []
    seen = set()
    for idx, window in enumerate(query_hashes):
        # Only the first occurrence in the query counts, as with list.index().
        if window in seen:
            continue
        seen.add(window)

        target_idx = first_in_target.get(window)
        if target_idx is not None:
            ret.append(Match(idx, target_idx, 1))
    return ret
|
|
|
|
|
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
def get_hashes(bytes: Bytes, window_size: int) -> list[str]:
    """Slice the normalized string into overlapping windows.

    Args:
        bytes: record whose ``normalized`` string is windowed.
        window_size: number of characters per window.

    Returns:
        The substrings of length *window_size* starting at indices
        0 .. len(normalized) - window_size - 1, in order. The list is empty
        when the string is not longer than *window_size*.
    """
    text = bytes.normalized
    # NOTE(review): the window starting at len(text) - window_size is never
    # produced. group_matches() reports len(group) + window_size as a run's
    # length, so changing this bound alone would shift reported lengths --
    # confirm the intended behaviour of both before touching either.
    return [text[start : start + window_size] for start in range(len(text) - window_size)]
|
|
|
|
|
2022-10-11 23:13:08 +02:00
|
|
|
|
2022-11-01 08:41:14 +01:00
|
|
|
def group_matches(query: str, target: str, matches: list[Match], window_size: int,
                  min: Optional[int], max: Optional[int], contains: Optional[int]) -> list[Result]:
    """Merge window matches with consecutive query offsets into Result runs.

    Matches are ordered by query offset; a match whose query offset is
    exactly one more than the previous match's joins the current run, any
    other match starts a new run. Each run becomes a Result positioned at the
    run's first match, unless it is filtered out.

    Args:
        query: name of the query symbol (copied into every Result).
        target: name of the target symbol (copied into every Result).
        matches: window matches to group; the list is not modified.
        window_size: window width used when the matches were produced.
        min: when given, drop runs that end before this query offset.
        max: when given, drop runs that start after this query offset.
        contains: when given, keep only runs whose query span includes this
            offset.

    Returns:
        The surviving runs in ascending query order; an empty list when
        *matches* is empty.
    """
    # Nothing to group; also avoids indexing into an empty list.
    if not matches:
        return []

    # Work on a sorted copy so the caller's list keeps its order.
    ordered = sorted(matches, key=lambda m: m.query_offset)

    match_groups: List[List[Match]] = []
    last_start = None
    for current in ordered:
        if last_start is not None and current.query_offset == last_start + 1:
            match_groups[-1].append(current)
        else:
            match_groups.append([current])
        last_start = current.query_offset

    ret = []
    for group in match_groups:
        query_start = group[0].query_offset
        target_start = group[0].target_offset
        # NOTE(review): k consecutive windows of width w cover k + w - 1
        # positions, yet this adds the full window_size. get_hashes() also
        # leaves out the final window, so the two may be meant to offset each
        # other -- confirm before changing.
        length = len(group) + window_size

        if min is not None and query_start + length < min:
            continue
        if max is not None and query_start > max:
            continue
        if contains is not None and (query_start > contains or query_start + length < contains):
            continue

        ret.append(Result(query, target, query_start, target_start, length))

    return ret
|
|
|
|
|
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
def get_line_numbers(obj_file: Path) -> Dict[int, int]:
    """Map starting addresses to source line numbers for an object file.

    Runs ``OBJDUMP -WL`` on *obj_file* (stderr discarded) and parses its
    output. The first seven output lines are skipped (presumably the header
    of the decoded line table -- confirm for the objdump version in use).
    For the rest, the second whitespace-separated field is read as a decimal
    line number and the third as an address (base auto-detected).

    Lines are ignored when they have fewer than three fields, when their
    first field is the objdump program name or starts with "<", or when
    either number fails to parse.

    Args:
        obj_file: path handed to objdump.

    Returns:
        Dict of starting address to line number; a later row for the same
        address replaces an earlier one.
    """
    proc = subprocess.run(
        [OBJDUMP, "-WL", obj_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    objdump_out = proc.stdout.decode("utf-8").split("\n")

    ret: Dict[int, int] = {}
    for line in objdump_out[7:]:
        pieces = line.split()

        # Also covers blank lines, which split into no fields at all.
        if len(pieces) < 3:
            continue

        fn = pieces[0]
        if fn == OBJDUMP or fn[0] == "<":
            continue

        # Parse both columns under one guard so that a stray non-numeric row
        # (e.g. a repeated table header) is skipped instead of raising.
        try:
            starting_addr = int(pieces[2], 0)
            line_num = int(pieces[1])
        except ValueError:
            continue

        ret[starting_addr] = line_num

    return ret
|
|
|
|
|
|
|
|
|
|
|
|
def get_tu_offset(obj_file: Path, symbol: str) -> Optional[int]:
    """Look up *symbol* in the symbol table printed by ``OBJDUMP -t``.

    The first four output lines are skipped. On the first remaining line
    whose last whitespace-separated field equals *symbol*, the first field
    is parsed as hexadecimal and returned (presumably the symbol's offset
    within its object file -- confirm).

    Args:
        obj_file: path handed to objdump.
        symbol: exact symbol name to look for.

    Returns:
        The parsed value, or None when no line matches.
    """
    proc = subprocess.run([OBJDUMP, "-t", obj_file], stdout=subprocess.PIPE)
    objdump_out = proc.stdout.decode("utf-8").split("\n")

    for line in objdump_out[4:]:
        pieces = line.split()
        # Blank or whitespace-only lines split into no fields; skip them.
        if pieces and pieces[-1] == symbol:
            return int(pieces[0], 16)

    return None
|
|
|
|
|
2022-10-25 12:04:54 +02:00
|
|
|
@dataclass
class CRange:
    """A span of source line numbers whose ends may be exact or approximate.

    All four attributes are dataclass fields, so they are accepted by the
    constructor and take part in equality comparison and repr().
    """

    start: Optional[int] = None  # first line number, None when unknown
    end: Optional[int] = None  # last line number, None when unknown
    start_exact: bool = False  # True when start was an exact lookup hit
    end_exact: bool = False  # True when end was an exact lookup hit

    def has_info(self):
        """Return True when at least one end of the range is known."""
        return self.start is not None or self.end is not None

    @staticmethod
    def _describe(value, exact):
        """Format one endpoint: "?" if unknown, "~N" if approximate, else "N"."""
        if value is None:
            return "?"
        return f"{value}" if exact else f"~{value}"

    def __str__(self):
        """Render the range as "<start> - <end>"."""
        start_str = self._describe(self.start, self.start_exact)
        end_str = self._describe(self.end, self.end_exact)
        return f"{start_str} - {end_str}"
|
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
|
2022-10-25 12:04:54 +02:00
|
|
|
def get_c_range(insn_start: int, insn_end: int, line_numbers: Dict[int, int]) -> CRange:
    """Translate an address span into a CRange of source line numbers.

    An address that is itself a key of *line_numbers* gives an exact
    endpoint. Otherwise the endpoint is approximated by walking the keys in
    the dict's own order (assumes they were inserted in ascending address
    order -- confirm against the producer):
      * start: the line of the key that directly precedes the first key
        greater than *insn_start*;
      * end: the line of the first key greater than *insn_end*.
    An endpoint is left as None when no such key exists.

    Args:
        insn_start: address where the span begins.
        insn_end: address where the span ends.
        line_numbers: mapping of address to line number.

    Returns:
        The CRange describing the span.
    """
    c_range = CRange()
    addrs = list(line_numbers)

    if insn_start in line_numbers:
        c_range.start = line_numbers[insn_start]
        c_range.start_exact = True
    else:
        for addr, following in zip(addrs, addrs[1:]):
            if following > insn_start:
                c_range.start = line_numbers[addr]
                break

    if insn_end in line_numbers:
        c_range.end = line_numbers[insn_end]
        c_range.end_exact = True
    else:
        for addr in addrs:
            if addr > insn_end:
                c_range.end = line_numbers[addr]
                break

    return c_range
|
2022-10-18 16:27:02 +02:00
|
|
|
|
|
|
|
|
2023-04-17 21:51:16 +02:00
|
|
|
def get_matches(query: str, window_size: int, min: Optional[int], max: Optional[int], contains: Optional[int], show_disasm: bool):
    """Compare *query* against every other symbol and print the shared runs.

    For each symbol in syms (other than the query) that has byte data and is
    at least *window_size* 4-byte words long, the window lists of both are
    intersected and grouped into runs. Every symbol with at least one
    surviving run gets a header line followed by one line per run. For
    symbols flagged as decompiled, objdump line-number data is used to append
    a source line range. With *show_disasm*, both sides of each run are
    additionally disassembled side by side using rabbitizer.

    Args:
        query: name of the symbol to search for.
        window_size: window width passed to get_hashes()/group_matches().
        min, max, contains: run filters forwarded to group_matches().
        show_disasm: print disassembly for every run.

    Returns:
        An OrderedDict built from the local score table sorted by value,
        descending.

    Exits the process when *query* has no byte data, or when disassembly is
    requested but rabbitizer cannot be imported.
    """
    query_bytes: Optional[Bytes] = get_symbol_bytes(query)
    if query_bytes is None:
        sys.exit("Symbol '" + query + "' not found")

    query_hashes = get_hashes(query_bytes, window_size)

    # NOTE(review): nothing below ever stores into this table, so the
    # returned mapping is always empty -- confirm whether scores were meant
    # to be recorded here.
    ret: dict[str, float] = {}

    for symbol, sym_info in syms.items():
        if symbol == query:
            continue

        sym_bytes: Optional[Bytes] = get_symbol_bytes(symbol)
        if not sym_bytes:
            continue

        # Skip symbols shorter than one window (4 bytes per word).
        if len(sym_bytes.bytes) / 4 < window_size:
            continue

        sym_hashes = get_hashes(sym_bytes, window_size)
        matches: list[Match] = get_pair_matches(query_hashes, sym_hashes)
        if not matches:
            continue

        results: list[Result] = group_matches(query, symbol, matches, window_size, min, max, contains)
        if not results:
            continue

        obj_file = sym_info.current_file

        # Source line information is only looked up for decompiled symbols.
        if sym_info.is_decompiled:
            line_numbers = get_line_numbers(obj_file)
            tu_offset = get_tu_offset(obj_file, symbol)
            decompiled_str = fg.green + " (decompiled)" + fg.rs + ":"
        else:
            line_numbers = {}
            tu_offset = None
            decompiled_str = ":"

        print(symbol + decompiled_str)

        for result in results:
            c_range = None
            if tu_offset is not None and len(line_numbers) > 0:
                # Run offsets count 4-byte words; scale them to byte addresses.
                c_range = get_c_range(
                    tu_offset + (result.target_start * 4),
                    tu_offset + (result.target_end * 4),
                    line_numbers,
                )

            target_range_str = ""
            if c_range:
                target_range_str = fg.li_cyan + f" (line {c_range} in {obj_file.stem})" + fg.rs

            query_str = f"query [{result.query_start}-{result.query_end}]"
            target_str = f"{symbol} [insn {result.target_start}-{result.target_end}] ({result.length} total){target_range_str}"
            print(f"\t{query_str} matches {target_str}")

            if not show_disasm:
                continue

            # Optional dependency, imported only when disassembly is wanted.
            try:
                import rabbitizer
            except ImportError:
                print("rabbitizer not found, cannot show disassembly")
                sys.exit(1)

            result_query_bytes = query_bytes.bytes[result.query_start * 4 : result.query_end * 4]
            result_target_bytes = sym_bytes.bytes[result.target_start * 4 : result.target_end * 4]

            for pos in range(0, len(result_query_bytes), 4):
                q_word = int.from_bytes(result_query_bytes[pos : pos + 4], "big")
                t_word = int.from_bytes(result_target_bytes[pos : pos + 4], "big")
                q_insn = rabbitizer.Instruction(q_word)
                t_insn = rabbitizer.Instruction(t_word)

                print(f"\t\t{q_insn.disassemble():35} | {t_insn.disassemble()}")

    return OrderedDict(sorted(ret.items(), key=lambda kv: kv[1], reverse=True))
|
2022-10-04 16:09:23 +02:00
|
|
|
|
|
|
|
|
2023-04-17 21:51:16 +02:00
|
|
|
def do_query(query, window_size, min, max, contains, show_disasm):
    """Run a similarity search for *query*; every argument is forwarded to get_matches().

    The value returned by get_matches() is discarded.
    """
    get_matches(
        query=query,
        window_size=window_size,
        min=min,
        max=max,
        contains=contains,
        show_disasm=show_disasm,
    )
|
2022-10-04 16:09:23 +02:00
|
|
|
|
2022-10-18 16:27:02 +02:00
|
|
|
|
|
|
|
# Command-line interface. Arguments are parsed at module level, so `args` is
# available to the entry-point code that follows.
parser = argparse.ArgumentParser(
    description="Tool to find duplicate portions of code from one function in code across the codebase"
)
parser.add_argument("query", help="function")
parser.add_argument(
    "-w",
    "--window-size",
    # The window counts normalized characters, one per 4-byte word of the
    # function, so the unit is instructions rather than bytes.
    help="number of consecutive instructions to compare",
    type=int,
    default=20,
)
parser.add_argument("--min", help="lower bound of instruction for matches against query", type=int)
parser.add_argument("--max", help="upper bound of instruction for matches against query", type=int)
parser.add_argument(
    "--contains",
    help="All matches must contain this number'th instruction from the query",
    type=int,
)
parser.add_argument("--show-disasm", help="Show disassembly of matches", action="store_true")

args = parser.parse_args()
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Module-level state read by the functions above. unmatched_functions and
    # func_sizes must be assigned before parse_map() runs, because it looks
    # both of them up.
    rom_bytes = read_rom()
    unmatched_functions = get_all_unmatched_functions()
    func_sizes = get_func_sizes()
    syms = parse_map()

    do_query(
        query=args.query,
        window_size=args.window_size,
        min=args.min,
        max=args.max,
        contains=args.contains,
        show_disasm=args.show_disasm,
    )
|