1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 04:02:41 +01:00

[find_interesting_reviews.py] Add git blame output cache

The majority of the running time of this script tends to be spent in
running git blame on source files touched by patches under review.

By introducing a git blame output cache, some of the git blame commands
don't have to re-run, and the blame information can be retrieved from a
cache.

I've observed that in a typical run matching patches available for
review with potential reviewers, this reduces the script's running
time by a factor of about 2.5.
This commit is contained in:
Kristof Beyls 2019-12-23 12:01:47 +00:00
parent a6c442cb15
commit 32e21be127

View File

@ -458,11 +458,11 @@ def get_git_cmd_output(cmd):
reAuthorMail = re.compile("^author-mail <([^>]*)>.*$") reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")
def parse_blame_output_line_porcelain(blame_output): def parse_blame_output_line_porcelain(blame_output_lines):
email2nr_occurences = {} email2nr_occurences = {}
if blame_output is None: if blame_output_lines is None:
return email2nr_occurences return email2nr_occurences
for line in blame_output.split('\n'): for line in blame_output_lines:
m = reAuthorMail.match(line) m = reAuthorMail.match(line)
if m: if m:
author_email_address = m.group(1) author_email_address = m.group(1)
@ -473,6 +473,54 @@ def parse_blame_output_line_porcelain(blame_output):
return email2nr_occurences return email2nr_occurences
class BlameOutputCache:
    """Cache of `git blame --line-porcelain` output.

    Entries are keyed on (git_repo, base_revision, path). Blame is always
    computed for the whole file once; queries for a line range are answered
    from the cached whole-file output instead of re-running git.
    """

    def __init__(self):
        # Maps (git_repo, base_revision, path) to the blame output split into
        # text lines, or to None when get_git_cmd_output returned None.
        self.cache = {}
        # Maps the same keys to the blame output grouped per source line.
        # Filled lazily, the first time a line range is requested for a key.
        self._records = {}

    def _populate_cache_for(self, cache_key):
        """Run git blame on the whole file and store its output lines."""
        assert cache_key not in self.cache
        git_repo, base_revision, path = cache_key
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     path)
        blame_output = get_git_cmd_output(cmd)
        self.cache[cache_key] = \
            blame_output.split('\n') if blame_output is not None else None
        # FIXME: the blame cache could probably be made more effective still if
        # instead of storing the requested base_revision in the cache, the last
        # revision before the base revision this file/path got changed in gets
        # stored. That way multiple project revisions for which this specific
        # file/path hasn't changed would get cache hits (instead of misses in
        # the current implementation).

    @staticmethod
    def _group_by_source_line(blame_lines):
        """Split line-porcelain output into one list of lines per source line.

        In the line-porcelain format every source line is reported as a run
        of header lines followed by the source text itself prefixed with a
        TAB, so a TAB-prefixed line closes a record. Anything after the last
        complete record (such as the empty string left over from splitting a
        trailing newline) is dropped.
        """
        records = []
        current = []
        for line in blame_lines:
            current.append(line)
            if line.startswith('\t'):
                records.append(current)
                current = []
        return records

    def get_blame_output_for(self, git_repo, base_revision, path, start_line=-1,
                             end_line=-1):
        """Return the blame output lines for a file, or for a range of it.

        With the default start_line/end_line of -1 the complete output is
        returned. Otherwise start_line and end_line are 1-based, inclusive
        source line numbers, mirroring `git blame -L start_line,end_line`,
        and only the output describing those source lines is returned. A
        start_line of 0 is treated as 1; a range reaching past the end of the
        file is clamped to the lines that exist.

        Returns None when no blame output is available for the file.
        """
        cache_key = (git_repo, base_revision, path)
        if cache_key not in self.cache:
            self._populate_cache_for(cache_key)
        assert cache_key in self.cache
        all_blame_lines = self.cache[cache_key]
        if all_blame_lines is None:
            return None
        if start_line == -1 and end_line == -1:
            return all_blame_lines
        assert start_line >= 0
        assert end_line >= 0
        assert start_line <= end_line
        # The range counts source lines, not lines of blame output: each
        # source line spans several output lines, so select whole records.
        records = self._records.get(cache_key)
        if records is None:
            records = self._group_by_source_line(all_blame_lines)
            self._records[cache_key] = records
        first_record = max(start_line - 1, 0)
        return [line
                for record in records[first_record:end_line]
                for line in record]

    def get_parsed_git_blame_for(self, git_repo, base_revision, path,
                                 start_line=-1, end_line=-1):
        """Return the parsed blame information for a file or a range of it.

        Takes the same arguments as get_blame_output_for and passes the
        selected output lines to parse_blame_output_line_porcelain.
        """
        return parse_blame_output_line_porcelain(
            self.get_blame_output_for(git_repo, base_revision, path, start_line,
                                      end_line))
# Module-level BlameOutputCache instance; find_reviewers_for_diff_heuristic
# queries it so that blame output already computed can be reused.
blameOutputCache = BlameOutputCache()
def find_reviewers_for_diff_heuristic(diff): def find_reviewers_for_diff_heuristic(diff):
# Heuristic 1: assume good reviewers are the ones that touched the same # Heuristic 1: assume good reviewers are the ones that touched the same
# lines before as this patch is touching. # lines before as this patch is touching.
@ -496,23 +544,18 @@ def find_reviewers_for_diff_heuristic(diff):
for hunk in change.hunks: for hunk in change.hunks:
for start_line, end_line in hunk.actual_lines_changed_offset: for start_line, end_line in hunk.actual_lines_changed_offset:
# Collect git blame results for authors in those ranges. # Collect git blame results for authors in those ranges.
cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
"-w --line-porcelain -L {1},{2} {3} -- {4}").format(
git_repo, start_line, end_line, base_revision, path)
blame_output = get_git_cmd_output(cmd)
for reviewer, nr_occurences in \ for reviewer, nr_occurences in \
parse_blame_output_line_porcelain(blame_output).items(): blameOutputCache.get_parsed_git_blame_for(
git_repo, base_revision, path, start_line, end_line
).items():
if reviewer not in reviewers2nr_lines_touched: if reviewer not in reviewers2nr_lines_touched:
reviewers2nr_lines_touched[reviewer] = 0 reviewers2nr_lines_touched[reviewer] = 0
reviewers2nr_lines_touched[reviewer] += nr_occurences reviewers2nr_lines_touched[reviewer] += nr_occurences
# Compute heuristic 2: don't look at context, just at files touched. # Compute heuristic 2: don't look at context, just at files touched.
# Collect git blame results for authors in those ranges. # Collect git blame results for authors in those ranges.
cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " + for reviewer, nr_occurences in \
"--line-porcelain {1} -- {2}").format(git_repo, base_revision, blameOutputCache.get_parsed_git_blame_for(
path) git_repo, base_revision, path).items():
blame_output = get_git_cmd_output(cmd)
for reviewer, nr_occurences in parse_blame_output_line_porcelain(
blame_output).items():
if reviewer not in reviewers2nr_files_touched: if reviewer not in reviewers2nr_files_touched:
reviewers2nr_files_touched[reviewer] = 0 reviewers2nr_files_touched[reviewer] = 0
reviewers2nr_files_touched[reviewer] += 1 reviewers2nr_files_touched[reviewer] += 1