Improve find_duplicates.py performance, ~30x (#618)

- Use length heuristic to avoid computing true Lev ratio
- Prebuild `sym_bytes` map for `sym_name` -> `query_bytes`
This commit is contained in:
Zach Banks 2022-01-13 01:25:39 -05:00 committed by GitHub
parent 79bcd008f9
commit fdf0ed245b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -124,6 +124,11 @@ def diff_syms(qb, tb):
if len(tb[1]) < 8: if len(tb[1]) < 8:
return 0 return 0
# The minimum edit distance for two strings of different lengths is `abs(l1 - l2)`
# Quickly check if it's impossible to beat the threshold. If it is, then return 0
l1, l2 = len(qb[0]), len(tb[0])
if abs(l1 - l2) / (l1 + l2) > 1.0 - args.threshold:
return 0
r = ratio(qb[0], tb[0]) r = ratio(qb[0], tb[0])
if r == 1.0 and qb[1] != tb[1]: if r == 1.0 and qb[1] != tb[1]:
@ -261,36 +266,38 @@ def do_cross_query():
ccount = Counter() ccount = Counter()
clusters = [] clusters = []
sym_bytes = {}
for sym_name in map_syms: for sym_name in map_syms:
if not sym_name.startswith("D_") and \ if not sym_name.startswith("D_") and \
not sym_name.startswith("_binary") and \ not sym_name.startswith("_binary") and \
not sym_name.startswith("jtbl_") and \ not sym_name.startswith("jtbl_") and \
not re.match(r"L[0-9A-F]{8}_[0-9A-F]{5,6}", sym_name): not re.match(r"L[0-9A-F]{8}_[0-9A-F]{5,6}", sym_name):
if get_symbol_length(sym_name) > 16: if get_symbol_length(sym_name) > 16:
query_bytes = get_symbol_bytes(map_offsets, sym_name) sym_bytes[sym_name] = get_symbol_bytes(map_offsets, sym_name)
cluster_match = False for sym_name, query_bytes in sym_bytes.items():
for cluster in clusters: cluster_match = False
cluster_first = cluster[0] for cluster in clusters:
cluster_score = get_pair_score(query_bytes, cluster_first) cluster_first = cluster[0]
if cluster_score >= args.threshold: cluster_score = diff_syms(query_bytes, sym_bytes[cluster_first])
cluster_match = True if cluster_score >= args.threshold:
if sym_name.startswith("func") and not cluster_first.startswith("func"): cluster_match = True
ccount[sym_name] = ccount[cluster_first] if sym_name.startswith("func") and not cluster_first.startswith("func"):
del ccount[cluster_first] ccount[sym_name] = ccount[cluster_first]
cluster_first = sym_name del ccount[cluster_first]
cluster.insert(0, cluster_first) cluster_first = sym_name
else: cluster.insert(0, cluster_first)
cluster.append(sym_name) else:
cluster.append(sym_name)
if cluster_first.startswith("func"): if cluster_first.startswith("func"):
ccount[cluster_first] += 1 ccount[cluster_first] += 1
#if len(cluster) % 10 == 0 and len(cluster) >= 10: #if len(cluster) % 10 == 0 and len(cluster) >= 10:
print(f"Cluster {cluster_first} grew to size {len(cluster)} - {sym_name}: {str(cluster_score)}") print(f"Cluster {cluster_first} grew to size {len(cluster)} - {sym_name}: {str(cluster_score)}")
break break
if not cluster_match: if not cluster_match:
clusters.append([sym_name]) clusters.append([sym_name])
print(ccount.most_common(100)) print(ccount.most_common(100))