Skip to content

Commit 15f1287

Browse files
authored
Use parallel parsing at all stages (#21266)
This is a follow-up to #21175 and #21119. Right now we are only using parallel parsing during initial graph loading. This PR allows using it when processing SCCs that are stale due to dependencies (these are not parsed on initial loading). Implementation is a bit less trivial than I thought, because we need to completely extract the tree de-serialization (which is not parallelizable). It is still relatively straightforward. The new logic is like this: * When calling `parse(eager=False)` (default) we always return a serialized tree. * When calling `parse(eager=True)` a caller can force immediate de-serialization (when suitable), this will trigger `load_from_raw()`. * To get more manual control (e.g. for `imports_only=True`) one can use `parse(eager=False)` followed by `load_from_raw()` directly.
1 parent f8127c8 commit 15f1287

8 files changed

Lines changed: 146 additions & 105 deletions

File tree

mypy/build.py

Lines changed: 64 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@
121121
ImportFrom,
122122
MypyFile,
123123
OverloadedFuncDef,
124-
ParseError,
125124
SymbolTable,
126125
)
127126
from mypy.options import OPTIONS_AFFECTING_CACHE_NO_PLATFORM
@@ -168,7 +167,7 @@
168167
from mypy.modules_state import modules_state
169168
from mypy.nodes import Expression
170169
from mypy.options import Options
171-
from mypy.parse import load_from_raw, parse, report_parse_error
170+
from mypy.parse import load_from_raw, parse
172171
from mypy.plugin import ChainedPlugin, Plugin, ReportConfigContext
173172
from mypy.plugins.default import DefaultPlugin
174173
from mypy.renaming import LimitedVariableRenameVisitor, VariableRenameVisitor
@@ -999,13 +998,18 @@ def dump_stats(self) -> None:
999998
# Call print once so that we don't get a mess in parallel mode.
1000999
print("\n".join(lines) + "\n\n", end="")
10011000

1002-
def parse_all(self, states: list[State]) -> None:
1003-
"""Parse multiple files in parallel (if possible) and compute dependencies."""
1001+
def parse_all(self, states: list[State], post_parse: bool = True) -> None:
1002+
"""Parse multiple files in parallel (if possible) and compute dependencies.
1003+
1004+
If post_parse is False, skip the last step (used when parsing unchanged files
1005+
that need to be re-checked due to stale dependencies).
1006+
"""
10041007
if not self.options.native_parser:
10051008
# Old parser cannot be parallelized.
10061009
for state in states:
10071010
state.parse_file()
1008-
self.post_parse_all(states)
1011+
if post_parse:
1012+
self.post_parse_all(states)
10091013
return
10101014

10111015
sequential_states = []
@@ -1019,8 +1023,14 @@ def parse_all(self, states: list[State]) -> None:
10191023
sequential_states.append(state)
10201024
continue
10211025
parallel_states.append(state)
1022-
self.parse_parallel(sequential_states, parallel_states)
1023-
self.post_parse_all(states)
1026+
if len(parallel_states) > 1:
1027+
self.parse_parallel(sequential_states, parallel_states)
1028+
else:
1029+
# Avoid using executor when there is no parallelism.
1030+
for state in states:
1031+
state.parse_file()
1032+
if post_parse:
1033+
self.post_parse_all(states)
10241034

10251035
def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None:
10261036
"""Perform parallel parsing of states.
@@ -1030,7 +1040,10 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10301040
parallelized efficiently.
10311041
"""
10321042
futures = []
1033-
parallel_parsed_states = {}
1043+
# Use both list and a set to have more predictable order of errors,
1044+
# while also not sacrificing performance.
1045+
parallel_parsed_states = []
1046+
parallel_parsed_states_set = set()
10341047
# Use at least --num-workers if specified by user.
10351048
available_threads = max(get_available_threads(), self.options.num_workers)
10361049
# Overhead from trying to parallelize (small) blocking portion of
@@ -1048,7 +1061,8 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10481061
if ignore_errors:
10491062
self.errors.ignored_files.add(state.xpath)
10501063
futures.append(executor.submit(state.parse_file_inner, state.source or ""))
1051-
parallel_parsed_states[state.id] = state
1064+
parallel_parsed_states.append(state)
1065+
parallel_parsed_states_set.add(state)
10521066
else:
10531067
self.log(f"Using cached AST for {state.xpath} ({state.id})")
10541068
state.tree, state.early_errors = self.ast_cache[state.id]
@@ -1058,21 +1072,27 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10581072
state.parse_file()
10591073

10601074
for fut in wait(futures).done:
1061-
state_id, parse_errors = fut.result()
1062-
# New parser reports errors lazily, add them if any.
1063-
if parse_errors:
1064-
state = parallel_parsed_states[state_id]
1065-
with state.wrap_context():
1066-
self.errors.set_file(state.xpath, state.id, options=state.options)
1067-
for error in parse_errors:
1068-
report_parse_error(error, self.errors)
1069-
if self.errors.is_blockers():
1070-
self.log("Bailing due to parse errors")
1071-
self.errors.raise_error()
1075+
fut.result()
1076+
for state in parallel_parsed_states:
1077+
# New parser returns serialized trees that need to be de-serialized.
1078+
with state.wrap_context():
1079+
assert state.tree is not None
1080+
if state.tree.raw_data:
1081+
state.tree = load_from_raw(
1082+
state.xpath,
1083+
state.id,
1084+
state.tree.raw_data,
1085+
self.errors,
1086+
state.options,
1087+
imports_only=bool(self.workers),
1088+
)
1089+
if self.errors.is_blockers():
1090+
self.log("Bailing due to parse errors")
1091+
self.errors.raise_error()
10721092

10731093
for state in parallel_states:
10741094
assert state.tree is not None
1075-
if state.id in parallel_parsed_states:
1095+
if state in parallel_parsed_states_set:
10761096
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
10771097
state.semantic_analysis_pass1()
10781098
self.ast_cache[state.id] = (state.tree, state.early_errors)
@@ -1208,31 +1228,18 @@ def parse_file(
12081228
source: str,
12091229
options: Options,
12101230
raw_data: FileRawData | None = None,
1211-
) -> tuple[MypyFile, list[ParseError]]:
1231+
) -> MypyFile:
12121232
"""Parse the source of a file with the given name.
12131233
12141234
Raise CompileError if there is a parse error.
12151235
"""
1216-
imports_only = False
12171236
file_exists = self.fscache.exists(path)
1218-
if self.workers and file_exists:
1219-
# Currently, we can use the native parser only for actual files.
1220-
imports_only = True
12211237
t0 = time.time()
1222-
parse_errors: list[ParseError] = []
12231238
if raw_data:
12241239
# If possible, deserialize from known binary data instead of parsing from scratch.
12251240
tree = load_from_raw(path, id, raw_data, self.errors, options)
12261241
else:
1227-
tree, parse_errors = parse(
1228-
source,
1229-
path,
1230-
id,
1231-
self.errors,
1232-
options=options,
1233-
file_exists=file_exists,
1234-
imports_only=imports_only,
1235-
)
1242+
tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists)
12361243
tree._fullname = id
12371244
if self.stats_enabled:
12381245
with self.stats_lock:
@@ -1242,7 +1249,7 @@ def parse_file(
12421249
stubs_parsed=int(tree.is_stub),
12431250
parse_time=time.time() - t0,
12441251
)
1245-
return tree, parse_errors
1252+
return tree
12461253

12471254
def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]:
12481255
t0 = time.time()
@@ -3089,15 +3096,12 @@ def get_source(self) -> str:
30893096
self.time_spent_us += time_spent_us(t0)
30903097
return source
30913098

3092-
def parse_file_inner(
3093-
self, source: str, raw_data: FileRawData | None = None
3094-
) -> tuple[str, list[ParseError]]:
3099+
def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None:
30953100
t0 = time_ref()
3096-
self.tree, parse_errors = self.manager.parse_file(
3101+
self.tree = self.manager.parse_file(
30973102
self.id, self.xpath, source, options=self.options, raw_data=raw_data
30983103
)
30993104
self.time_spent_us += time_spent_us(t0)
3100-
return self.id, parse_errors
31013105

31023106
def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = None) -> None:
31033107
"""Parse file and run first pass of semantic analysis.
@@ -3106,7 +3110,8 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31063110
modules in any way. Logic here should be kept in sync with BuildManager.parse_all().
31073111
"""
31083112
self.needs_parse = False
3109-
if self.tree is not None:
3113+
tree = self.tree
3114+
if tree is not None:
31103115
# The file was already parsed.
31113116
return
31123117

@@ -3120,10 +3125,19 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31203125
self.manager.errors.ignored_files.add(self.xpath)
31213126
with self.wrap_context():
31223127
manager.errors.set_file(self.xpath, self.id, options=self.options)
3123-
_, parse_errors = self.parse_file_inner(source, raw_data)
3124-
for error in parse_errors:
3125-
# New parser reports errors lazily.
3126-
report_parse_error(error, manager.errors)
3128+
self.parse_file_inner(source, raw_data)
3129+
assert self.tree is not None
3130+
# New parser returns serialized trees that need to be de-serialized.
3131+
if self.tree.raw_data is not None:
3132+
assert raw_data is None
3133+
self.tree = load_from_raw(
3134+
self.xpath,
3135+
self.id,
3136+
self.tree.raw_data,
3137+
manager.errors,
3138+
self.options,
3139+
imports_only=bool(self.manager.workers),
3140+
)
31273141
if manager.errors.is_blockers():
31283142
manager.log("Bailing due to parse errors")
31293143
manager.errors.raise_error()
@@ -4631,9 +4645,9 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
46314645
# Re-generate import errors in case this module was loaded from the cache.
46324646
if graph[id].meta:
46334647
graph[id].verify_dependencies(suppressed_only=True)
4634-
# We may already have parsed the module, or not.
4635-
# If the former, parse_file() is a no-op.
4636-
graph[id].parse_file()
4648+
# We may already have parsed the modules, or not.
4649+
# If the former, parse_file() is a no-op.
4650+
manager.parse_all([graph[id] for id in stale], post_parse=False)
46374651
if "typing" in scc:
46384652
# For historical reasons we need to manually add typing aliases
46394653
# for built-in generic collections, see docstring of

mypy/build_worker/worker.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,14 +239,23 @@ def load_states(
239239
mod_data: dict[str, tuple[bytes, FileRawData | None]],
240240
) -> None:
241241
"""Re-create full state of an SCC as it would have been in coordinator."""
242+
needs_parse = []
242243
for id in scc.mod_ids:
243244
state = graph[id]
244245
# Re-clone options since we don't send them, it is usually faster than deserializing.
245246
state.options = state.options.clone_for_module(state.id)
246247
suppressed_deps_opts, raw_data = mod_data[id]
247-
state.parse_file(raw_data=raw_data)
248+
if raw_data is not None:
249+
state.parse_file(raw_data=raw_data)
250+
else:
251+
needs_parse.append(state)
248252
# Set data that is needed to be written to cache meta.
249253
state.known_suppressed_deps_opts = suppressed_deps_opts
254+
# Perform actual parsing in parallel (but we don't need to compute dependencies).
255+
if needs_parse:
256+
manager.parse_all(needs_parse, post_parse=False)
257+
for id in scc.mod_ids:
258+
state = graph[id]
250259
assert state.tree is not None
251260
import_lines = {imp.line for imp in state.tree.imports}
252261
state.imports_ignored = {

mypy/checkstrformat.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -581,13 +581,14 @@ def apply_field_accessors(
581581

582582
temp_errors = Errors(self.chk.options)
583583
dummy = DUMMY_FIELD_NAME + spec.field[len(spec.key) :]
584-
temp_ast, _ = parse(
584+
temp_ast = parse(
585585
dummy,
586586
fnam="<format>",
587587
module=None,
588588
options=self.chk.options,
589589
errors=temp_errors,
590590
file_exists=False,
591+
eager=True,
591592
)
592593
if temp_errors.is_errors():
593594
self.msg.fail(

mypy/nativeparse.py

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -190,12 +190,15 @@ def add_error(
190190

191191

192192
def native_parse(
193-
filename: str, options: Options, skip_function_bodies: bool = False, imports_only: bool = False
193+
filename: str, options: Options, skip_function_bodies: bool = False
194194
) -> tuple[MypyFile, list[ParseError], TypeIgnores]:
195195
"""Parse a Python file using the native Rust-based parser.
196196
197197
Return (MypyFile, errors, type_ignores).
198198
199+
The returned tree is empty with actual serialized data stored in `raw_data`
200+
attribute. Use read_statements() and/or deserialize_imports() to de-serialize.
201+
199202
The caller should set these additional attributes on the returned MypyFile:
200203
- ignored_lines: dict of type ignore comments (from the TypeIgnores return value)
201204
- is_stub: whether the file is a .pyi stub
@@ -210,26 +213,12 @@ def native_parse(
210213
b, errors, ignores, import_bytes, is_partial_package, uses_template_strings = (
211214
parse_to_binary_ast(filename, options, skip_function_bodies)
212215
)
213-
data = ReadBuffer(b)
214-
n = read_int(data)
215-
state = State(options)
216-
if imports_only:
217-
defs = []
218-
else:
219-
defs = read_statements(state, data, n)
220-
221-
imports = deserialize_imports(import_bytes)
222-
223-
node = MypyFile(defs, imports)
216+
node = MypyFile([], [])
224217
node.path = filename
225-
node.is_partial_stub_package = is_partial_package
226-
if imports_only:
227-
node.raw_data = FileRawData(
228-
b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings
229-
)
230-
node.uses_template_strings = uses_template_strings
231-
all_errors = errors + state.errors
232-
return node, all_errors, ignores
218+
node.raw_data = FileRawData(
219+
b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings
220+
)
221+
return node, errors, ignores
233222

234223

235224
def expect_end_tag(data: ReadBuffer) -> None:

0 commit comments

Comments
 (0)