From ac3797ea569334aab05a9af6ea7fb0898c4b9d2d Mon Sep 17 00:00:00 2001
From: Ethan Roseman <ethteck@gmail.com>
Date: Tue, 27 Apr 2021 21:36:33 +0900
Subject: [PATCH] Splat update to 0.7.3 (#283)

* change splat branch to master

* git subrepo pull --force tools/splat

subrepo:
  subdir:   "tools/splat"
  merged:   "924414a51d"
upstream:
  origin:   "https://github.com/ethteck/splat.git"
  branch:   "master"
  commit:   "924414a51d"
git-subrepo:
  version:  "0.4.3"
  origin:   "https://github.com/ingydotnet/git-subrepo"
  commit:   "2f68596"
---
 tools/splat/.github/workflows/mypy.yml | 15 +++++
 tools/splat/.gitrepo                   |  6 +-
 tools/splat/CHANGELOG.md               |  6 ++
 tools/splat/segtypes/n64/data.py       | 92 +++++++++++++++++++++-----
 tools/splat/segtypes/n64/rodata.py     | 13 ----
 5 files changed, 101 insertions(+), 31 deletions(-)
 create mode 100644 tools/splat/.github/workflows/mypy.yml

diff --git a/tools/splat/.github/workflows/mypy.yml b/tools/splat/.github/workflows/mypy.yml
new file mode 100644
index 0000000000..b842ebd82f
--- /dev/null
+++ b/tools/splat/.github/workflows/mypy.yml
@@ -0,0 +1,15 @@
+name: mypy
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: jpetrucciani/mypy-check@master
diff --git a/tools/splat/.gitrepo b/tools/splat/.gitrepo
index 784828a6ee..daa84bdda6 100644
--- a/tools/splat/.gitrepo
+++ b/tools/splat/.gitrepo
@@ -5,8 +5,8 @@
 ;
 [subrepo]
 	remote = https://github.com/ethteck/splat.git
-	branch = imgflip
-	commit = 4e012eaad6bffc4da7aed13d9a7f86bbfddf9150
-	parent = 8acbae1ea523f217fc6f2780ee83b37f5f2ac05b
+	branch = master
+	commit = 924414a51d0bcc52076b6ee7147b1bb1d20e804a
+	parent = 7515d21506205b43cccd28875f0d2765addb36ad
 	method = merge
 	cmdver = 0.4.3
diff --git a/tools/splat/CHANGELOG.md b/tools/splat/CHANGELOG.md
index 259410a5e8..68bd282b73 100644
--- a/tools/splat/CHANGELOG.md
+++ b/tools/splat/CHANGELOG.md
@@ -1,5 +1,11 @@
 # splat Release Notes
 
+### 0.7.2
+
+* Data disassembly changes:
+  * String detection has been improved. Please send me any false positives / negatives you come across so that I can improve it further!
+  * Symbols in a data segment that are pointed to by other symbols are now properly split out as their own symbols
+
 ### 0.7.1
 
 * Image segment changes:
diff --git a/tools/splat/segtypes/n64/data.py b/tools/splat/segtypes/n64/data.py
index 542e4c2b96..2c2a874e19 100644
--- a/tools/splat/segtypes/n64/data.py
+++ b/tools/splat/segtypes/n64/data.py
@@ -1,6 +1,6 @@
 from segtypes.n64.codesubsegment import N64SegCodeSubsegment
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional
 from util.symbols import Symbol
 from util import floats, options
 
@@ -33,14 +33,21 @@ class N64SegData(N64SegCodeSubsegment):
     def get_linker_section(self) -> str:
         return ".data"
 
-    def get_symbols(self):
-        ret = []
+    def get_symbols(self, rom_bytes):
+        symset = set()
+
+        # Find inter-data symbols
+        for i in range(self.rom_start, self.rom_end, 4):
+            bits = int.from_bytes(rom_bytes[i : i + 4], "big")
+            if self.contains_vram(bits):
+                symset.add(self.parent.get_symbol(bits, create=True, define=True, local_only=True))
 
         for symbol_addr in self.seg_symbols:
             for symbol in self.seg_symbols[symbol_addr]:
                 if not symbol.dead and self.contains_vram(symbol.vram_start):
-                    ret.append(symbol)
+                    symset.add(symbol)
 
+        ret = list(symset)
         ret.sort(key=lambda s:s.vram_start)
 
         # Ensure we start at the beginning
@@ -52,21 +59,75 @@ class N64SegData(N64SegCodeSubsegment):
 
         return ret
 
+    def are_null(chars):
+        for b in chars:
+            if b != '\x00':
+                return False
+        return True
+
     @staticmethod
     def is_valid_ascii(bytes):
-        if len(bytes) < 8:
+        null_char = '\x00'
+        valid_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890[]():%!#=-_ "
+        invalid_chars = ""
+        duplicate_limit = 10
+
+        last_char = 0
+        true_end = None
+        consecutive_duplicates = 0
+        valid_count = 0
+
+        if len(bytes) <= 4 or bytes[0] == 0:
             return False
 
-        num_empty_bytes = 0
-        for b in bytes:
-            if b == 0:
-                num_empty_bytes += 1
-
-        empty_ratio = num_empty_bytes / len(bytes)
-        if empty_ratio > 0.2:
+        try:
+            chars = bytes.decode("EUC-JP")
+        except:
             return False
 
-        return True
+        if len(chars) <= 4:
+            return False
+
+        for i, c in enumerate(chars):
+            # Ensure null bytes are only at the end of ascii strings
+            # TODO: if we find null bytes in the middle, break this into multiple strings ?
+            if c == null_char:
+                if true_end is None:
+                    if N64SegData.are_null(chars[i:]):
+                        true_end = i
+                    else:
+                        pass
+                        #return False
+
+            # Ensure we're not seeing a ton of the same character in a row
+            if last_char == c:
+                consecutive_duplicates += 1
+                if consecutive_duplicates >= duplicate_limit and last_char != null_char:
+                    return False
+            else:
+                consecutive_duplicates = 0
+
+            if c in valid_chars:
+                valid_count += 1
+            elif c in invalid_chars:
+                return False
+
+            last_char = c
+
+        # Ensure the number of valid characters is sufficient
+        if true_end is not None:
+            # If there are more than 16 null chars at the end, something is afoot
+            if len(chars) - true_end > 16:
+                return False
+            end = true_end
+        else:
+            end = len(chars)
+
+        valid_ratio = valid_count / end
+        if valid_ratio >= 0.75:
+            return True
+
+        return False
     
     def disassemble_symbol(self, sym_bytes, sym_type):
         if sym_type == "jtbl":
@@ -86,9 +147,10 @@ class N64SegData(N64SegCodeSubsegment):
         if sym_type == "ascii":
             try:
                 ascii_str = sym_bytes.decode("EUC-JP")
-                ascii_str = ascii_str.replace("\\", "\\\\")
+                # ascii_str = ascii_str.rstrip("\x00")
                 ascii_str = ascii_str.replace("\x00", "\\0")
                 ascii_str = ascii_str.replace("\n", "\\n")
+
                 sym_str += f'"{ascii_str}"'
                 return sym_str
             except:
@@ -148,7 +210,7 @@ class N64SegData(N64SegCodeSubsegment):
         if self.size == 0:
             return None
 
-        syms = self.get_symbols()
+        syms = self.get_symbols(rom_bytes)
 
         for i in range(len(syms) - 1):
             mnemonic = syms[i].access_mnemonic
diff --git a/tools/splat/segtypes/n64/rodata.py b/tools/splat/segtypes/n64/rodata.py
index 5cd4dbaee2..64c5832724 100644
--- a/tools/splat/segtypes/n64/rodata.py
+++ b/tools/splat/segtypes/n64/rodata.py
@@ -3,16 +3,3 @@ from segtypes.n64.data import N64SegData
 class N64SegRodata(N64SegData):    
     def get_linker_section(self) -> str:
         return ".rodata"
-
-    def scan(self, rom_bytes: bytes):
-        self.file_text = self.disassemble_data(rom_bytes)
-
-    def split(self, rom_bytes: bytes):
-        if self.file_text:
-            path = self.out_path()
-            
-            if path:
-                path.parent.mkdir(parents=True, exist_ok=True)
-
-                with open(path, "w", newline="\n") as f:
-                    f.write(self.file_text)