Yaml fixes + find_similar_areas update

This commit is contained in:
Ethan Roseman 2022-10-18 23:27:02 +09:00
parent 9e8f3eb36b
commit f59f495a36
No known key found for this signature in database
GPG Key ID: 27F9FCEB8E4969BD
2 changed files with 136 additions and 23 deletions

View File

@ -1,36 +1,41 @@
#!/usr/bin/python3
import argparse
import os
import re
import subprocess
import sys
from collections import OrderedDict
from dataclasses import dataclass
from pathlib import Path
import re
from typing import Optional
from typing import Dict, List, Optional, Tuple
import os
import sys
from sty import fg
# Filesystem layout, all resolved relative to the directory containing this script.
script_dir = Path(os.path.realpath(__file__)).parent
root_dir = script_dir / ".."
asm_dir = root_dir / "ver/current/asm/nonmatchings/"
build_dir = root_dir / "ver/current/build/"
# Defined once, derived from build_dir, instead of being assigned twice.
map_file_path = build_dir / "papermario.map"
rom_path = root_dir / "ver/current/baserom.z64"

# Name of the objdump executable invoked through subprocess.
OBJDUMP = "mips-linux-gnu-objdump"
@dataclass
class Symbol:
    """A function symbol together with its ROM/RAM placement."""

    name: str
    rom_start: int
    ram: int
    # Declared once, as a Path; parse_map constructs it with Path(...).
    current_file: Path
    prev_sym: str
    is_decompiled: bool
    # Stays None until an end offset is assigned to the symbol.
    rom_end: Optional[int] = None

    def size(self):
        """Return the symbol's length in ROM (rom_end - rom_start).

        Raises AssertionError if rom_end has not been set.
        """
        assert self.rom_end is not None
        return self.rom_end - self.rom_start
@dataclass
class Bytes:
offset: int
@ -42,6 +47,7 @@ def read_rom() -> bytes:
with open(rom_path, "rb") as f:
return f.read()
def get_all_unmatched_functions():
ret = set()
for root, dirs, files in os.walk(asm_dir):
@ -55,7 +61,7 @@ def get_symbol_bytes(func: str) -> Optional[Bytes]:
if func not in syms or syms[func].rom_end is None:
return None
sym = syms[func]
bs = list(rom_bytes[sym.rom_start:sym.rom_end])
bs = list(rom_bytes[sym.rom_start : sym.rom_end])
while len(bs) > 0 and bs[-1] == 0:
bs.pop()
@ -66,7 +72,7 @@ def get_symbol_bytes(func: str) -> Optional[Bytes]:
for ins in insns:
ret.append(ins >> 2)
return Bytes(0, bytes(ret).decode('utf-8'), bs)
return Bytes(0, bytes(ret).decode("utf-8"), bs)
def parse_map() -> OrderedDict[str, Symbol]:
@ -116,7 +122,7 @@ def parse_map() -> OrderedDict[str, Symbol]:
name=fn,
rom_start=rom,
ram=ram,
current_file=cur_file,
current_file=Path(cur_file),
prev_sym=prev_sym,
is_decompiled=not fn in unmatched_functions,
)
@ -150,7 +156,7 @@ class Result:
length: int
def get_pair_matches(query_hashes: list[int], sym_hashes: list[int]) -> list[Match]:
def get_pair_matches(query_hashes: list[str], sym_hashes: list[str]) -> list[Match]:
ret = []
matching_hashes = set(query_hashes).intersection(sym_hashes)
@ -159,7 +165,7 @@ def get_pair_matches(query_hashes: list[int], sym_hashes: list[int]) -> list[Mat
return ret
def get_hashes(bytes: Bytes, window_size: int) -> list[int]:
def get_hashes(bytes: Bytes, window_size: int) -> list[str]:
ret = []
for i in range(0, len(bytes.normalized) - window_size):
ret.append(bytes.normalized[i : i + window_size])
@ -171,7 +177,7 @@ def group_matches(query: str, target: str, matches: list[Match]) -> list[Result]
matches.sort(key=lambda m: m.query_offset)
match_groups = []
match_groups: List[List[Match]] = []
last_start = matches[0].query_offset
for match in matches:
if match.query_offset == last_start + 1:
@ -189,6 +195,74 @@ def group_matches(query: str, target: str, matches: list[Match]) -> list[Result]
return ret
def get_line_numbers(obj_file: Path) -> Dict[int, int]:
    """Map starting addresses to source line numbers for an object file.

    Runs ``OBJDUMP -WL`` on ``obj_file`` (stderr is discarded) and parses the
    rows of its output. A row contributes an entry when its second column
    parses as a decimal line number and its third column parses as an integer
    address (base auto-detected). Every other row is ignored.

    Returns:
        A dict of ``starting address -> line number``; empty when no row
        could be parsed.
    """
    objdump_out = (
        subprocess.run(
            [OBJDUMP, "-WL", obj_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        .stdout.decode("utf-8")
        .split("\n")
    )

    ret: Dict[int, int] = {}

    # The first seven lines are skipped unconditionally; presumably they hold
    # objdump's banner and the first table header -- TODO confirm.
    for line in objdump_out[7:]:
        pieces = line.split()
        # Blank lines and rows with too few columns carry no line info.
        if len(pieces) < 3:
            continue

        fn = pieces[0]
        if fn == OBJDUMP or fn[0] == "<":
            continue

        # Parse both numeric columns inside the same try so that a non-numeric
        # address column (e.g. a repeated table header) skips the row instead
        # of aborting the whole scan with ValueError.
        try:
            line_num = int(pieces[1])
            starting_addr = int(pieces[2], 0)
        except ValueError:
            continue

        ret[starting_addr] = line_num

    return ret
def get_tu_offset(obj_file: Path, symbol: str) -> Optional[int]:
    """Look up ``symbol`` in the symbol table of ``obj_file``.

    Runs ``OBJDUMP -t`` on the object file and scans its output for a row
    whose last column equals ``symbol``.

    Returns:
        The first column of the matching row parsed as a hexadecimal integer,
        or None when no row matches.
    """
    # Use the shared OBJDUMP constant rather than a second hard-coded copy of
    # the executable name, matching get_line_numbers.
    objdump_out = (
        subprocess.run([OBJDUMP, "-t", obj_file], stdout=subprocess.PIPE)
        .stdout.decode("utf-8")
        .split("\n")
    )

    # The first four lines are skipped unconditionally; presumably objdump's
    # banner and table title -- TODO confirm.
    for line in objdump_out[4:]:
        pieces = line.split()
        # Covers empty and whitespace-only lines, which would otherwise make
        # pieces[-1] raise IndexError.
        if not pieces:
            continue

        if pieces[-1] == symbol:
            return int(pieces[0], 16)

    return None
def get_c_range(
    insn_start: int, insn_end: int, line_numbers: Dict[int, int]
) -> Tuple[Optional[int], Optional[int]]:
    """Return the line numbers recorded for two addresses.

    Each address is looked up in ``line_numbers`` by exact key; an address
    with no entry yields None in its position of the returned pair.
    """
    return line_numbers.get(insn_start), line_numbers.get(insn_end)
def get_matches(query: str, window_size: int):
query_bytes: Optional[Bytes] = get_symbol_bytes(query)
@ -214,17 +288,46 @@ def get_matches(query: str, window_size: int):
matches: list[Match] = get_pair_matches(query_hashes, sym_hashes)
if matches:
results = group_matches(query, symbol, matches)
obj_file = syms[symbol].current_file
decompiled_str = ""
line_numbers = {}
tu_offset = None
decompiled_str = ":"
if syms[symbol].is_decompiled:
decompiled_str = " (decompiled)"
print(symbol + ":" + decompiled_str)
line_numbers = get_line_numbers(obj_file)
tu_offset = get_tu_offset(obj_file, symbol)
decompiled_str = fg.green + " (decompiled)" + fg.rs + ":"
print(symbol + decompiled_str)
for result in results:
total_len = result.length + window_size
query_str = f"{query} [{result.query_start}-{result.query_start + total_len}]"
target_str = f"{symbol} [{result.target_start}-{result.target_start + total_len}]"
print(f"\t{query_str} matches {target_str} ({total_len})")
query_end = result.query_start + total_len
target_end = result.target_start + total_len
c_start: Optional[int] = None
c_end: Optional[int] = None
if tu_offset is not None and len(line_numbers) > 0:
c_start, c_end = get_c_range(
tu_offset + (result.target_start * 4),
tu_offset + (target_end * 4),
line_numbers,
)
target_range_str = ""
if c_start is not None or c_end is not None:
start_str = c_start if c_start is not None else "?"
end_str = c_end if c_end is not None else "?"
target_range_str = (
fg.li_cyan + f" (line {start_str}-{end_str} in {obj_file.stem})" + fg.rs
)
query_str = f"{query} [{result.query_start}-{query_end}]"
target_str = (
f"{symbol} [{result.target_start}-{target_end}]{target_range_str}"
)
print(f"\t{query_str} matches {target_str} ({total_len} total insns)")
return OrderedDict(sorted(ret.items(), key=lambda kv: kv[1], reverse=True))
@ -232,9 +335,19 @@ def get_matches(query: str, window_size: int):
def do_query(query, window_size):
    """Run a similarity search for ``query``; the value returned by get_matches is discarded."""
    get_matches(query=query, window_size=window_size)
# Command-line interface. The parser is built once and each option is
# registered once: adding "-w/--window-size" a second time makes argparse
# raise ArgumentError (conflicting option string).
parser = argparse.ArgumentParser(
    description="Tool to find duplicate portions of code from one function in code across the codebase"
)
parser.add_argument("query", help="function")
# NOTE(review): the help text says "bytes", but the result printing elsewhere
# in this file reports lengths as "insns" -- confirm the intended unit.
parser.add_argument(
    "-w",
    "--window-size",
    help="number of bytes to compare",
    type=int,
    default=20,
    required=False,
)

args = parser.parse_args()

View File

@ -6680,7 +6680,7 @@ segments:
start: 0x6DDDC0
vram: 0x80218000
subsegments:
- [0x669D80, c, actor/koopa_troopa]
- [0x6DDDC0, c, actor/koopa_troopa]
- [0x6DDE90, c, actor/fuzzy]
- [0x6DE000, c, actor/pokey]
- [0x6DE0D0, c, actor/bandit]
@ -8917,7 +8917,7 @@ segments:
- [0x953FC0, c, sbk_56_1_main]
- [0x954D80, c, sbk_56_2_entity]
- [0x954D80, c, sbk_56_3_foliage]
- [0x954FC00]
- [0x954FC0]
- name: sbk_60
dir: world/area_sbk/sbk_60
type: code