Splat update to 0.7.3 (#283)

* change splat branch to master
* git subrepo pull --force tools/splat

subrepo:
  subdir:   "tools/splat"
  merged:   "924414a51d"
upstream:
  origin:   "https://github.com/ethteck/splat.git"
  branch:   "master"
  commit:   "924414a51d"
git-subrepo:
  version:  "0.4.3"
  origin:   "https://github.com/ingydotnet/git-subrepo"
  commit:   "2f68596"
2024-11-08 12:02:30 +01:00 · 2021-04-27 21:36:33 +09:00 · 2021-04-27 21:36:33 +09:00 · ac3797ea56
commit ac3797ea56
parent 1c0d26e6c6
5 changed files with 101 additions and 31 deletions
--- a/tools/splat/.github/workflows/mypy.yml
+++ b/tools/splat/.github/workflows/mypy.yml
@@ -0,0 +1,15 @@
+name: mypy
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: jpetrucciani/mypy-check@master
--- a/tools/splat/.gitrepo
+++ b/tools/splat/.gitrepo
@@ -5,8 +5,8 @@
 ;
 [subrepo]
 	remote = https://github.com/ethteck/splat.git
-	branch = imgflip
-	commit = 4e012eaad6bffc4da7aed13d9a7f86bbfddf9150
-	parent = 8acbae1ea523f217fc6f2780ee83b37f5f2ac05b
+	branch = master
+	commit = 924414a51d0bcc52076b6ee7147b1bb1d20e804a
+	parent = 7515d21506205b43cccd28875f0d2765addb36ad
 	method = merge
 	cmdver = 0.4.3
--- a/tools/splat/CHANGELOG.md
+++ b/tools/splat/CHANGELOG.md
@@ -1,5 +1,11 @@
 # splat Release Notes

+### 0.7.2
+
+* Data disassembly changes:
+  * String detection has been improved. Please send me false positives / negatives as you see them and I can try to improve it further!
+  * Symbols in a data segment pointed to by other symbols will now properly be split out as their own symbols
+
 ### 0.7.1

 * Image segment changes:
--- a/tools/splat/segtypes/n64/data.py
+++ b/tools/splat/segtypes/n64/data.py
@@ -1,6 +1,6 @@
 from segtypes.n64.codesubsegment import N64SegCodeSubsegment
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional
 from util.symbols import Symbol
 from util import floats, options

@@ -33,14 +33,21 @@ class N64SegData(N64SegCodeSubsegment):
    def get_linker_section(self) -> str:
        return ".data"

-    def get_symbols(self):
-        ret = []
+    def get_symbols(self, rom_bytes):
+        symset = set()
+
+        # Find inter-data symbols
+        for i in range(self.rom_start, self.rom_end, 4):
+            bits = int.from_bytes(rom_bytes[i : i + 4], "big")
+            if self.contains_vram(bits):
+                symset.add(self.parent.get_symbol(bits, create=True, define=True, local_only=True))

        for symbol_addr in self.seg_symbols:
            for symbol in self.seg_symbols[symbol_addr]:
                if not symbol.dead and self.contains_vram(symbol.vram_start):
-                    ret.append(symbol)
+                    symset.add(symbol)

+        ret = list(symset)
        ret.sort(key=lambda s:s.vram_start)

        # Ensure we start at the beginning
@@ -52,21 +59,75 @@ class N64SegData(N64SegCodeSubsegment):

        return ret

+    def are_null(chars):
+        for b in chars:
+            if b != '\x00':
+                return False
+        return True
+
    @staticmethod
    def is_valid_ascii(bytes):
-        if len(bytes) < 8:
+        null_char = '\x00'
+        valid_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890[]():%!#=-_ "
+        invalid_chars = ""
+        duplicate_limit = 10
+
+        last_char = 0
+        true_end = None
+        consecutive_duplicates = 0
+        valid_count = 0
+
+        if len(bytes) <= 4 or bytes[0] == 0:
            return False

-        num_empty_bytes = 0
-        for b in bytes:
-            if b == 0:
-                num_empty_bytes += 1
-
-        empty_ratio = num_empty_bytes / len(bytes)
-        if empty_ratio > 0.2:
+        try:
+            chars = bytes.decode("EUC-JP")
+        except:
            return False

-        return True
+        if len(chars) <= 4:
+            return False
+
+        for i, c in enumerate(chars):
+            # Ensure null bytes are only at the end of ascii strings
+            # TODO: if we find null bytes in the middle, break this into multiple strings ?
+            if c == null_char:
+                if true_end is None:
+                    if N64SegData.are_null(chars[i:]):
+                        true_end = i
+                    else:
+                        pass
+                        #return False
+
+            # Ensure we're not seeing a ton of the same character in a row
+            if last_char == c:
+                consecutive_duplicates += 1
+                if consecutive_duplicates >= duplicate_limit and last_char != null_char:
+                    return False
+            else:
+                consecutive_duplicates = 0
+
+            if c in valid_chars:
+                valid_count += 1
+            elif c in invalid_chars:
+                return False
+
+            last_char = c
+
+        # Ensure the number of valid characters is sufficient
+        if true_end is not None:
+            # If there are more than 16 null chars at the end, something is afoot
+            if len(chars) - true_end > 16:
+                return False
+            end = true_end
+        else:
+            end = len(chars)
+
+        valid_ratio = valid_count / end
+        if valid_ratio >= 0.75:
+            return True
+
+        return False
    
    def disassemble_symbol(self, sym_bytes, sym_type):
        if sym_type == "jtbl":
@@ -86,9 +147,10 @@ class N64SegData(N64SegCodeSubsegment):
        if sym_type == "ascii":
            try:
                ascii_str = sym_bytes.decode("EUC-JP")
-                ascii_str = ascii_str.replace("\\", "\\\\")
+                # ascii_str = ascii_str.rstrip("\x00")
                ascii_str = ascii_str.replace("\x00", "\\0")
                ascii_str = ascii_str.replace("\n", "\\n")
+
                sym_str += f'"{ascii_str}"'
                return sym_str
            except:
@@ -148,7 +210,7 @@ class N64SegData(N64SegCodeSubsegment):
        if self.size == 0:
            return None

-        syms = self.get_symbols()
+        syms = self.get_symbols(rom_bytes)

        for i in range(len(syms) - 1):
            mnemonic = syms[i].access_mnemonic
--- a/tools/splat/segtypes/n64/rodata.py
+++ b/tools/splat/segtypes/n64/rodata.py
@@ -3,16 +3,3 @@ from segtypes.n64.data import N64SegData
 class N64SegRodata(N64SegData):    
    def get_linker_section(self) -> str:
        return ".rodata"
-
-    def scan(self, rom_bytes: bytes):
-        self.file_text = self.disassemble_data(rom_bytes)
-
-    def split(self, rom_bytes: bytes):
-        if self.file_text:
-            path = self.out_path()
-            
-            if path:
-                path.parent.mkdir(parents=True, exist_ok=True)
-
-                with open(path, "w", newline="\n") as f:
-                    f.write(self.file_text)