Skip to content

Commit c50d340

Browse files
Luni-4 authored and marco-c committed
Use external software to compare metrics
The two programs now used to compare metrics are written in Rust and provide a considerable speedup on large repositories.
1 parent a12ea98 commit c50d340

1 file changed

Lines changed: 13 additions & 147 deletions

File tree

check-submodule.py

Lines changed: 13 additions & 147 deletions
Original file line number | Diff line number | Diff line change
@@ -17,26 +17,23 @@
1717
1818
./check-submodule.py compute-ci-metrics -p LOCAL_DIR -l TREE_SITTER_LANGUAGE
1919
20+
To compare metrics and retrieve the structural JSON of differences
21+
in addition to the files containing the minimal tests:
2022
21-
To compare metrics and retrieve minimal tests:
22-
23-
1. Install deepdiff: pip install deepdiff
23+
1. Install json-diff from here: https://github.com/Luni-4/json-diff/releases
24+
2. Install json-minimal-tests from here: https://github.com/Luni-4/json-minimal-tests/releases
2425
2526
./check-submodule.py compare-metrics -l TREE_SITTER_LANGUAGE
27+
28+
NOTE: Add the paths of the software above to the PATH environment variable!
2629
"""
2730

2831
import argparse
29-
import asyncio
30-
import json
31-
import math
3232
import pathlib
33-
import re
3433
import subprocess
3534
import sys
3635
import typing as T
3736

38-
import deepdiff
39-
4037
# The /tmp directory will be used as workdir
4138
WORKDIR = pathlib.Path("/tmp")
4239
# Suffix for the directory containing the old metrics
@@ -69,42 +66,6 @@
6966
"tree-sitter-python": ["*.py"],
7067
}
7168

72-
73-
class JsonDiff:
74-
def __init__(
75-
self,
76-
old_metrics: T.List[pathlib.Path],
77-
new_metrics: T.List[pathlib.Path],
78-
compare_dir: pathlib.Path,
79-
max_workers: int,
80-
):
81-
self.compare_dir = compare_dir
82-
self.max_workers = max_workers
83-
84-
# Max number of file paths in a sublist
85-
n = math.ceil(len(old_metrics) / max_workers)
86-
87-
# Assign a certain number of filepaths to each worker
88-
self.workers_filepaths = [
89-
zip(old_metrics[i * n : (i + 1) * n], new_metrics[i * n : (i + 1) * n])
90-
for i in range((len(old_metrics) + n - 1) // n)
91-
]
92-
93-
# Run asynchronous comparisons between json files.
94-
async def diff(self):
95-
# Save minimal tests in the chosen directory.
96-
def _worker(worker_list: T.List[pathlib.Path]):
97-
for old_filename, new_filename in worker_list:
98-
99-
# Compute minimal tests
100-
compute_minimal_tests(old_filename, new_filename, self.compare_dir)
101-
102-
# Define the max number of coroutines used to compare json files
103-
await asyncio.gather(
104-
*(_worker(worker_filepaths) for worker_filepaths in self.workers_filepaths)
105-
)
106-
107-
10869
# Run a subprocess.
10970
def run_subprocess(cmd: str, *args: T.Union[str, pathlib.Path]) -> None:
11071
subprocess.run([cmd, *args])
@@ -138,106 +99,6 @@ def run_rca(
13899
)
139100

140101

141-
# Find the difference between the two json metric files.
142-
def get_json_diff(
143-
first_file: pathlib.Path, second_file: pathlib.Path
144-
) -> T.Tuple[T.Dict[str, T.Any], T.Dict[str, T.Any]]:
145-
with open(first_file, "r") as input_file:
146-
t1 = json.load(input_file)
147-
148-
with open(second_file, "r") as input_file:
149-
t2 = json.load(input_file)
150-
151-
diff = deepdiff.DeepDiff(t1, t2, ignore_order=True)
152-
153-
return (t1, diff)
154-
155-
156-
# Save the filename and the list of code spans associated to the differences
157-
# in a dictionary.
158-
def get_metrics_diff_span(
159-
first_json: T.Dict[str, T.Any], diff: T.Dict[str, T.Any]
160-
) -> T.Dict[str, T.List[T.Tuple[int, int]]]:
161-
# Search for this pattern in the differences object
162-
prog = re.compile(r"\['spaces'\]\[\d+\]")
163-
164-
output = {"name": first_json["name"], "spaces_spans": []}
165-
166-
for value in diff["values_changed"]:
167-
val = "".join(prog.findall(value))
168-
# Subtracting one because files starts from 0
169-
start_line = eval(f'first_json{val}["start_line"]') - 1
170-
end_line = eval(f'first_json{val}["end_line"]')
171-
output["spaces_spans"].append((start_line, end_line))
172-
173-
# Print the path of the repository file containing the differences
174-
print(first_json["name"])
175-
176-
return output
177-
178-
179-
# Dump minimal tests code in an output file.
180-
def dump_minimal_tests(
181-
code_spans_object: T.Dict[str, T.List[T.Tuple[int, int]]],
182-
new_filename: pathlib.Path,
183-
compare_dir: pathlib.Path,
184-
) -> None:
185-
# Remove duplicates from the list of spans
186-
spans_list = dict.fromkeys(code_spans_object["spaces_spans"])
187-
188-
# Get filename
189-
filename = code_spans_object["name"]
190-
191-
# Read code spans from the input source code
192-
with open(filename, "r", encoding="utf-8", errors="ignore") as input_file:
193-
# Decode only utf-8 source code files
194-
lines = input_file.readlines()
195-
196-
# Write spans to output file
197-
output_path = compare_dir / new_filename.stem
198-
with open(output_path, "w") as output_file:
199-
for span in spans_list:
200-
output_file.write("Minimal test:\n")
201-
output_file.write("".join(lines[span[0] : span[1]]) + "\n")
202-
203-
204-
# Compute minimal tests.
205-
def compute_minimal_tests(
206-
old_filename: pathlib.Path, new_filename: pathlib.Path, compare_dir: pathlib.Path
207-
) -> None:
208-
# Find the difference between the two json files with the aim of
209-
# getting some minimal tests
210-
first_json, diff = get_json_diff(old_filename, new_filename)
211-
212-
# If two json files are identical, return
213-
if not diff:
214-
return
215-
216-
# Retrieve the code spans associated to the differences
217-
code_spans_object = get_metrics_diff_span(first_json, diff)
218-
219-
# Dump the minimal tests retrived from code spans on a file with the
220-
# same extension of the analyzed source code
221-
dump_minimal_tests(code_spans_object, new_filename, compare_dir)
222-
223-
224-
# Save json files of differences and minimal tests in the chosen directory
225-
# concurrently.
226-
def save_diff_files(
227-
old_dir: pathlib.Path, new_dir: pathlib.Path, compare_dir: pathlib.Path
228-
) -> None:
229-
# Get all metric files in old and new directories
230-
old_paths = sorted(pathlib.Path(old_dir).glob("*.json"))
231-
new_paths = sorted(pathlib.Path(new_dir).glob("*.json"))
232-
233-
# Create a new coroutines handler
234-
json_diff = JsonDiff(old_paths, new_paths, compare_dir, 4)
235-
236-
# Find the differences between json files and save the results in a
237-
# chosen directory asynchronously
238-
asyncio.run(json_diff.diff())
239-
240-
241102
# Compute continuous integration metrics before and after a
242103
# tree-sitter-language update.
243104
def compute_ci_metrics(args: argparse.Namespace) -> None:
@@ -342,8 +203,13 @@ def compare_metrics(args: argparse.Namespace) -> None:
342203
# Create compare directory
343204
compare_dir.mkdir(parents=True, exist_ok=True)
344205

345-
# Save files of differences and minimal tests in the chosen directory
346-
save_diff_files(old_dir, new_dir, compare_dir)
206+
# Get JSON of differences
207+
print("\nSave JSON of differences in", compare_dir)
208+
run_subprocess("json-diff-cli", "--raw-json", "-o", compare_dir, old_dir, new_dir)
209+
210+
# Get minimal tests
211+
print("\nSave minimal tests in", compare_dir)
212+
run_subprocess("json-minimal-tests", "-o", compare_dir, old_dir, new_dir)
347213

348214

349215
def main() -> None:

0 commit comments

Comments
 (0)