Yaml fixes + find_similar_areas update

2024-11-08 12:02:30 +01:00 · 2022-10-18 23:27:02 +09:00 · 2022-10-18 23:27:02 +09:00 · f59f495a36
commit f59f495a36
parent 9e8f3eb36b
2 changed files with 136 additions and 23 deletions
--- a/tools/find_similar_areas.py
+++ b/tools/find_similar_areas.py
@ -1,36 +1,41 @@
 #!/usr/bin/python3

 import argparse
+import os
+import re
+import subprocess
+import sys
 from collections import OrderedDict
 from dataclasses import dataclass
 from pathlib import Path
-import re
-from typing import Optional
+from typing import Dict, List, Optional, Tuple

-import os
-import sys
+from sty import fg

 script_dir = Path(os.path.dirname(os.path.realpath(__file__)))
 root_dir = script_dir / ".."
 asm_dir = root_dir / "ver/current/asm/nonmatchings/"
-map_file_path = root_dir / "ver/current/build/papermario.map"
+build_dir = root_dir / "ver/current/build/"
+map_file_path = build_dir / "papermario.map"
 rom_path = root_dir / "ver/current/baserom.z64"

+OBJDUMP = "mips-linux-gnu-objdump"

@dataclass
 class Symbol:
    name: str
    rom_start: int
    ram: int
-    current_file: str
+    current_file: Path
    prev_sym: str
    is_decompiled: bool
    rom_end: Optional[int] = None

    def size(self):
-        assert(self.rom_end is not None)
+        assert self.rom_end is not None
        return self.rom_end - self.rom_start

+
@dataclass
 class Bytes:
    offset: int
@ -42,6 +47,7 @@ def read_rom() -> bytes:
    with open(rom_path, "rb") as f:
        return f.read()

+
 def get_all_unmatched_functions():
    ret = set()
    for root, dirs, files in os.walk(asm_dir):
@ -55,7 +61,7 @@ def get_symbol_bytes(func: str) -> Optional[Bytes]:
    if func not in syms or syms[func].rom_end is None:
        return None
    sym = syms[func]
-    bs = list(rom_bytes[sym.rom_start:sym.rom_end])
+    bs = list(rom_bytes[sym.rom_start : sym.rom_end])

    while len(bs) > 0 and bs[-1] == 0:
        bs.pop()
@ -66,7 +72,7 @@ def get_symbol_bytes(func: str) -> Optional[Bytes]:
    for ins in insns:
        ret.append(ins >> 2)

-    return Bytes(0, bytes(ret).decode('utf-8'), bs)
+    return Bytes(0, bytes(ret).decode("utf-8"), bs)


 def parse_map() -> OrderedDict[str, Symbol]:
@ -116,7 +122,7 @@ def parse_map() -> OrderedDict[str, Symbol]:
                    name=fn,
                    rom_start=rom,
                    ram=ram,
-                    current_file=cur_file,
+                    current_file=Path(cur_file),
                    prev_sym=prev_sym,
                    is_decompiled=not fn in unmatched_functions,
                )
@ -150,7 +156,7 @@ class Result:
    length: int


-def get_pair_matches(query_hashes: list[int], sym_hashes: list[int]) -> list[Match]:
+def get_pair_matches(query_hashes: list[str], sym_hashes: list[str]) -> list[Match]:
    ret = []

    matching_hashes = set(query_hashes).intersection(sym_hashes)
@ -159,7 +165,7 @@ def get_pair_matches(query_hashes: list[int], sym_hashes: list[int]) -> list[Mat
    return ret


-def get_hashes(bytes: Bytes, window_size: int) -> list[int]:
+def get_hashes(bytes: Bytes, window_size: int) -> list[str]:
    ret = []
    for i in range(0, len(bytes.normalized) - window_size):
        ret.append(bytes.normalized[i : i + window_size])
@ -171,7 +177,7 @@ def group_matches(query: str, target: str, matches: list[Match]) -> list[Result]

    matches.sort(key=lambda m: m.query_offset)

-    match_groups = []
+    match_groups: List[List[Match]] = []
    last_start = matches[0].query_offset
    for match in matches:
        if match.query_offset == last_start + 1:
@ -189,6 +195,74 @@ def group_matches(query: str, target: str, matches: list[Match]) -> list[Result]
    return ret


+def get_line_numbers(obj_file: Path) -> Dict[int, int]:
+    """Map starting addresses in ``obj_file`` to source line numbers.
+
+    Runs ``OBJDUMP -WL`` on the object file and parses every row of the
+    form ``<file> <line> <address> ...``.
+
+    Args:
+        obj_file: Path of the object file to inspect.
+
+    Returns:
+        A dict of starting address -> line number. It is empty when objdump
+        prints no parsable rows; objdump's stderr is discarded, so nothing
+        is reported in that case.
+    """
+    ret: Dict[int, int] = {}
+
+    objdump_out = (
+        subprocess.run(
+            [OBJDUMP, "-WL", obj_file],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+        )
+        .stdout.decode("utf-8")
+        .split("\n")
+    )
+
+    # The first 7 lines are skipped unconditionally; presumably they are the
+    # banner / column-header preamble of the listing -- verify against the
+    # objdump version in use.
+    for line in objdump_out[7:]:
+        pieces = line.split()
+
+        # Covers blank lines as well as rows lacking the three columns
+        # (file, line, address) read below.
+        if len(pieces) < 3:
+            continue
+
+        fn = pieces[0]
+
+        if fn == OBJDUMP or fn[0] == "<":
+            continue
+
+        # Both columns must be numeric. Any other row (for example a repeated
+        # column-header line) is skipped rather than aborting the whole scan.
+        try:
+            line_num = int(pieces[1])
+            starting_addr = int(pieces[2], 0)
+        except ValueError:
+            continue
+        ret[starting_addr] = line_num
+    return ret
+
+
+def get_tu_offset(obj_file: Path, symbol: str) -> Optional[int]:
+    """Return the value listed for ``symbol`` in ``obj_file``'s symbol table.
+
+    Runs ``OBJDUMP -t`` on the object file and looks for the first row whose
+    last whitespace-separated field equals ``symbol``. The first field of
+    that row is parsed as a hexadecimal integer.
+
+    Args:
+        obj_file: Path of the object file to inspect.
+        symbol: Exact symbol name to search for.
+
+    Returns:
+        The parsed value, or ``None`` when no row ends with ``symbol``.
+    """
+    # Use the module-level OBJDUMP constant so both objdump callers agree on
+    # the tool name.
+    objdump_out = (
+        subprocess.run([OBJDUMP, "-t", obj_file], stdout=subprocess.PIPE)
+        .stdout.decode("utf-8")
+        .split("\n")
+    )
+
+    # The first 4 lines are skipped unconditionally; presumably they are the
+    # listing's banner -- verify against the objdump version in use.
+    for line in objdump_out[4:]:
+        pieces = line.split()
+
+        # Blank and whitespace-only lines split into an empty list, which
+        # would make pieces[-1] raise IndexError.
+        if not pieces:
+            continue
+
+        if pieces[-1] == symbol:
+            return int(pieces[0], 16)
+    return None
+
+
+def get_c_range(
+    insn_start: int, insn_end: int, line_numbers: Dict[int, int]
+) -> Tuple[Optional[int], Optional[int]]:
+    """Look up the line numbers recorded for two addresses.
+
+    Returns a ``(start, end)`` pair; each element is ``None`` when
+    ``line_numbers`` has no entry for that exact key.
+    """
+    return line_numbers.get(insn_start), line_numbers.get(insn_end)
+
+
 def get_matches(query: str, window_size: int):
    query_bytes: Optional[Bytes] = get_symbol_bytes(query)

@ -214,17 +288,46 @@ def get_matches(query: str, window_size: int):
        matches: list[Match] = get_pair_matches(query_hashes, sym_hashes)
        if matches:
            results = group_matches(query, symbol, matches)
+            obj_file = syms[symbol].current_file

-            decompiled_str = ""
+            line_numbers = {}
+            tu_offset = None
+            decompiled_str = ":"
            if syms[symbol].is_decompiled:
-                decompiled_str = " (decompiled)"
-            print(symbol + ":" + decompiled_str)
+                line_numbers = get_line_numbers(obj_file)
+                tu_offset = get_tu_offset(obj_file, symbol)
+                decompiled_str = fg.green + " (decompiled)" + fg.rs + ":"
+
+            print(symbol + decompiled_str)

            for result in results:
                total_len = result.length + window_size
-                query_str = f"{query} [{result.query_start}-{result.query_start + total_len}]"
-                target_str = f"{symbol} [{result.target_start}-{result.target_start + total_len}]"
-                print(f"\t{query_str} matches {target_str} ({total_len})")
+                query_end = result.query_start + total_len
+                target_end = result.target_start + total_len
+
+                c_start: Optional[int] = None
+                c_end: Optional[int] = None
+                if tu_offset is not None and len(line_numbers) > 0:
+                    c_start, c_end = get_c_range(
+                        tu_offset + (result.target_start * 4),
+                        tu_offset + (target_end * 4),
+                        line_numbers,
+                    )
+
+                target_range_str = ""
+                if c_start is not None or c_end is not None:
+                    start_str = c_start if c_start is not None else "?"
+                    end_str = c_end if c_end is not None else "?"
+
+                    target_range_str = (
+                        fg.li_cyan + f" (line {start_str}-{end_str} in {obj_file.stem})" + fg.rs
+                    )
+
+                query_str = f"{query} [{result.query_start}-{query_end}]"
+                target_str = (
+                    f"{symbol} [{result.target_start}-{target_end}]{target_range_str}"
+                )
+                print(f"\t{query_str} matches {target_str} ({total_len} total insns)")

    return OrderedDict(sorted(ret.items(), key=lambda kv: kv[1], reverse=True))

@ -232,9 +335,19 @@ def get_matches(query: str, window_size: int):
 def do_query(query, window_size):
    get_matches(query, window_size)

-parser = argparse.ArgumentParser(description="Tool to find duplicate portions of code from one function in code across the codebase")
+
+parser = argparse.ArgumentParser(
+    description="Tool to find duplicate portions of code from one function in code across the codebase"
+)
 parser.add_argument("query", help="function")
-parser.add_argument("-w", "--window-size", help="number of bytes to compare", type=int, default=20, required=False)
+parser.add_argument(
+    "-w",
+    "--window-size",
+    help="number of bytes to compare",
+    type=int,
+    default=20,
+    required=False,
+)

 args = parser.parse_args()

--- a/ver/us/splat.yaml
+++ b/ver/us/splat.yaml
@ -6680,7 +6680,7 @@ segments:
    start: 0x6DDDC0
    vram: 0x80218000
    subsegments:
-    - [0x669D80, c, actor/koopa_troopa]
+    - [0x6DDDC0, c, actor/koopa_troopa]
    - [0x6DDE90, c, actor/fuzzy]
    - [0x6DE000, c, actor/pokey]
    - [0x6DE0D0, c, actor/bandit]
@ -8917,7 +8917,7 @@ segments:
    - [0x953FC0, c, sbk_56_1_main]
    - [0x954D80, c, sbk_56_2_entity]
    - [0x954D80, c, sbk_56_3_foliage]
-    - [0x954FC00]
+    - [0x954FC0]
  - name: sbk_60
    dir: world/area_sbk/sbk_60
    type: code