mirror of
https://github.com/pmret/papermario.git
synced 2024-11-08 12:02:30 +01:00
Improve find_duplicates.py performance by ~30x (#618)
- Use a length heuristic to avoid computing the true Levenshtein ratio when the threshold cannot be met
- Prebuild a `sym_bytes` map from each `sym_name` to its `query_bytes`
This commit is contained in:
parent
79bcd008f9
commit
fdf0ed245b
@ -124,6 +124,11 @@ def diff_syms(qb, tb):
|
|||||||
if len(tb[1]) < 8:
|
if len(tb[1]) < 8:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
# The minimum edit distance for two strings of different lengths is `abs(l1 - l2)`
|
||||||
|
# Quickly check if it's impossible to beat the threshold. If it is, then return 0
|
||||||
|
l1, l2 = len(qb[0]), len(tb[0])
|
||||||
|
if abs(l1 - l2) / (l1 + l2) > 1.0 - args.threshold:
|
||||||
|
return 0
|
||||||
r = ratio(qb[0], tb[0])
|
r = ratio(qb[0], tb[0])
|
||||||
|
|
||||||
if r == 1.0 and qb[1] != tb[1]:
|
if r == 1.0 and qb[1] != tb[1]:
|
||||||
@ -261,36 +266,38 @@ def do_cross_query():
|
|||||||
ccount = Counter()
|
ccount = Counter()
|
||||||
clusters = []
|
clusters = []
|
||||||
|
|
||||||
|
sym_bytes = {}
|
||||||
for sym_name in map_syms:
|
for sym_name in map_syms:
|
||||||
if not sym_name.startswith("D_") and \
|
if not sym_name.startswith("D_") and \
|
||||||
not sym_name.startswith("_binary") and \
|
not sym_name.startswith("_binary") and \
|
||||||
not sym_name.startswith("jtbl_") and \
|
not sym_name.startswith("jtbl_") and \
|
||||||
not re.match(r"L[0-9A-F]{8}_[0-9A-F]{5,6}", sym_name):
|
not re.match(r"L[0-9A-F]{8}_[0-9A-F]{5,6}", sym_name):
|
||||||
if get_symbol_length(sym_name) > 16:
|
if get_symbol_length(sym_name) > 16:
|
||||||
query_bytes = get_symbol_bytes(map_offsets, sym_name)
|
sym_bytes[sym_name] = get_symbol_bytes(map_offsets, sym_name)
|
||||||
|
|
||||||
cluster_match = False
|
for sym_name, query_bytes in sym_bytes.items():
|
||||||
for cluster in clusters:
|
cluster_match = False
|
||||||
cluster_first = cluster[0]
|
for cluster in clusters:
|
||||||
cluster_score = get_pair_score(query_bytes, cluster_first)
|
cluster_first = cluster[0]
|
||||||
if cluster_score >= args.threshold:
|
cluster_score = diff_syms(query_bytes, sym_bytes[cluster_first])
|
||||||
cluster_match = True
|
if cluster_score >= args.threshold:
|
||||||
if sym_name.startswith("func") and not cluster_first.startswith("func"):
|
cluster_match = True
|
||||||
ccount[sym_name] = ccount[cluster_first]
|
if sym_name.startswith("func") and not cluster_first.startswith("func"):
|
||||||
del ccount[cluster_first]
|
ccount[sym_name] = ccount[cluster_first]
|
||||||
cluster_first = sym_name
|
del ccount[cluster_first]
|
||||||
cluster.insert(0, cluster_first)
|
cluster_first = sym_name
|
||||||
else:
|
cluster.insert(0, cluster_first)
|
||||||
cluster.append(sym_name)
|
else:
|
||||||
|
cluster.append(sym_name)
|
||||||
|
|
||||||
if cluster_first.startswith("func"):
|
if cluster_first.startswith("func"):
|
||||||
ccount[cluster_first] += 1
|
ccount[cluster_first] += 1
|
||||||
|
|
||||||
#if len(cluster) % 10 == 0 and len(cluster) >= 10:
|
#if len(cluster) % 10 == 0 and len(cluster) >= 10:
|
||||||
print(f"Cluster {cluster_first} grew to size {len(cluster)} - {sym_name}: {str(cluster_score)}")
|
print(f"Cluster {cluster_first} grew to size {len(cluster)} - {sym_name}: {str(cluster_score)}")
|
||||||
break
|
break
|
||||||
if not cluster_match:
|
if not cluster_match:
|
||||||
clusters.append([sym_name])
|
clusters.append([sym_name])
|
||||||
print(ccount.most_common(100))
|
print(ccount.most_common(100))
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user