Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2e793ae
Enable native parsing to use source directly
bzoracler Apr 17, 2026
50b0860
Remove `file_exists` parameter from `mypy.parse.parse()` calls
bzoracler Apr 17, 2026
ac275e4
Test for invalid bytes
bzoracler Apr 28, 2026
149e459
Fix omitted argument
bzoracler Apr 28, 2026
47e45f7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 28, 2026
96e8e07
Merge branch 'master' into nativeparse-source
bzoracler May 15, 2026
beba478
Merge branch 'master' into nativeparse-source
ilevkivskyi May 17, 2026
99c7610
Remove fscache existence checks and parallel workarounds
bzoracler May 18, 2026
5c45fe2
Remove handling of sequential states
bzoracler May 18, 2026
8e53191
Don't pass dummy source
bzoracler May 18, 2026
b029c44
Refactor to allow a safer `source=None`
bzoracler May 18, 2026
d1691cc
Raise OSError in parallel mode for missing files
bzoracler May 18, 2026
2a523b5
Temporarily fix test
bzoracler May 18, 2026
6d72fff
Remove redundant condition
bzoracler May 18, 2026
1a28227
Don't create duplicate parallel states
bzoracler May 18, 2026
cb1bcae
Fix missing parse call
bzoracler May 18, 2026
df34d0b
Revert "Temporarily fix test"
bzoracler May 18, 2026
49c65b8
Revert "Refactor to allow a safer `source=None`"
bzoracler May 18, 2026
f47a898
Handle `source=None` in `parse()` function
bzoracler May 19, 2026
8cfffa0
Check for file existence before parallel parsing
bzoracler May 19, 2026
b9cc0b5
Handle `source=None` when `--package-root` is set
bzoracler May 19, 2026
20b035e
Simplify handling of `--package-root`
bzoracler May 19, 2026
52422db
Inline parallel parsing
bzoracler May 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion misc/dump-ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No
options.python_version = python_version
with open(fname, "rb") as f:
s = f.read()
tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True)
tree = parse(s, fname, None, errors=Errors(options), options=options)
if not quiet:
print(tree)

Expand Down
154 changes: 73 additions & 81 deletions mypy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,85 +1024,77 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None:
self.post_parse_all(states)
return

sequential_states = []
parallel_states = []
for state in states:
if not self.fscache.exists(state.xpath):
build_error(
"Cannot read file '{}': {}".format(
state.xpath.replace(os.getcwd() + os.sep, ""),
os.strerror(2), # `errno.ENOENT`
)
)
if state.tree is not None:
# The file was already parsed.
continue
if not self.fscache.exists(state.xpath, real_only=True):
# New parser only supports parsing on-disk files.
sequential_states.append(state)
state.needs_parse = False
continue
parallel_states.append(state)

if len(parallel_states) > 1:
self.parse_parallel(sequential_states, parallel_states)
else:
# Avoid using executor when there is no parallelism.
for state in states:
state.parse_file()
if post_parse:
self.post_parse_all(states)
# This duplicates a bit of logic from State.parse_file(). This is done to
# optimize handling of states parsed in parallel.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've copied the previous body of `parse_parallel()` here verbatim. Even with the earlier sequential-states handling removed, I don't think `State.parse_file()` can be refactored simply enough for parallel parsing to share its logic.


def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None:
"""Perform parallel parsing of states.
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
parallel_states
)

Note: this duplicates a bit of logic from State.parse_file(). This is done
as an optimization to parallelize only those parts of the code that can be
parallelized efficiently.
"""
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
sequential_states, parallel_states
)
for state in parallel_parsed_states:
# New parser only returns serialized ASTs
with state.wrap_context():
assert state.tree is not None
raw_data = state.tree.raw_data
if raw_data is not None:
state.source_hash = raw_data.source_hash
state.apply_inline_configuration(raw_data.mypy_comments)
state.tree = load_from_raw(
state.xpath,
state.id,
raw_data,
self.errors,
state.options,
imports_only=bool(self.workers),
)
if self.errors.is_blockers():
self.log("Bailing due to parse errors")
self.errors.raise_error()

for state in parallel_parsed_states:
# New parser returns serialized ASTs. Deserialize full trees only if not using
# parallel workers.
with state.wrap_context():
for state in parallel_states:
assert state.tree is not None
raw_data = state.tree.raw_data
if raw_data is not None:
# Apply inline mypy config before deserialization, since
# some options (e.g. implicit_optional) affect deserialization
state.source_hash = raw_data.source_hash
state.apply_inline_configuration(raw_data.mypy_comments)
state.tree = load_from_raw(
state.xpath,
state.id,
raw_data,
self.errors,
state.options,
imports_only=bool(self.workers),
)
if self.errors.is_blockers():
self.log("Bailing due to parse errors")
self.errors.raise_error()

for state in parallel_states:
assert state.tree is not None
if state in parallel_parsed_states_set:
if state in parallel_parsed_states_set:
if state.tree.raw_data is not None:
# source_hash was already extracted above, but raw_data
# may have been preserved for workers (imports_only=True).
pass
elif state.source_hash is None:
# At least namespace packages may not have source.
state.get_source()
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
state.semantic_analysis_pass1()
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
self.modules[state.id] = state.tree
if state.tree.raw_data is not None:
# source_hash was already extracted above, but raw_data
# may have been preserved for workers (imports_only=True).
pass
elif state.source_hash is None:
# At least namespace packages may not have source.
state.get_source()
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
state.semantic_analysis_pass1()
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
self.modules[state.id] = state.tree
if state.tree.raw_data is not None:
state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT
state.check_blockers()
state.setup_errors()

def parse_files_threaded_raw(
self, sequential_states: list[State], parallel_states: list[State]
) -> tuple[list[State], set[State]]:
"""Parse files using a thread pool.

Also parse sequential states while waiting for the parallel results.
state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT
state.check_blockers()
state.setup_errors()
elif len(parallel_states) == 1:
# Avoid using executor when there is no parallelism.
parallel_states[0].parse_file()

if post_parse:
self.post_parse_all(states)

def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]:
"""Parse files in parallel using a thread pool.

Trees from the new parser are left in raw (serialized) form.

Return (list, set) of states that were actually parsed (not cached).
Expand All @@ -1118,25 +1110,21 @@ def parse_files_threaded_raw(
# parse_file_inner() results in no visible improvement with more than 8 threads.
# TODO: reuse thread pool and/or batch small files in single submit() call.
with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
for state in parallel_states:
for state in states:
state.needs_parse = False
if state.id not in self.ast_cache:
self.log(f"Parsing {state.xpath} ({state.id})")
ignore_errors = state.ignore_all or state.options.ignore_errors
if ignore_errors:
self.errors.ignored_files.add(state.xpath)
futures.append(executor.submit(state.parse_file_inner, ""))
futures.append(executor.submit(state.parse_file_inner, state.source))
parallel_parsed_states.append(state)
parallel_parsed_states_set.add(state)
else:
self.log(f"Using cached AST for {state.xpath} ({state.id})")
state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
state.source_hash = source_hash

# Parse sequential before waiting on parallel.
for state in sequential_states:
state.parse_file()

for fut in wait(futures).done:
fut.result()

Expand Down Expand Up @@ -1279,21 +1267,27 @@ def parse_file(
self,
id: str,
path: str,
source: str,
source: str | None,
options: Options,
raw_data: FileRawData | None = None,
) -> MypyFile:
"""Parse the source of a file with the given name.

Raise CompileError if there is a parse error.
"""
file_exists = self.fscache.exists(path, real_only=True)
Comment thread
bzoracler marked this conversation as resolved.
t0 = time.time()
if raw_data:
# If possible, deserialize from known binary data instead of parsing from scratch.
tree = load_from_raw(path, id, raw_data, self.errors, options)
else:
tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists)
# Handle fake `__init__.py` files due to `--package-root`
if (
(source is None)
and (os.path.dirname(path) in self.fscache.fake_package_cache)
and (os.path.basename(path) == "__init__.py")
):
source = ""
Comment on lines +1283 to +1289
Copy link
Copy Markdown
Contributor Author

@bzoracler bzoracler May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This replaces the previous handling based on `file_exists = self.fscache.exists(path, real_only=True)` in the same method.

tree = parse(source, path, id, self.errors, options=options)
tree._fullname = id
if self.stats_enabled:
with self.stats_lock:
Expand Down Expand Up @@ -3179,7 +3173,7 @@ def get_source(self) -> str:
else:
err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}"
raise CompileError([err], module_with_blocker=self.id) from decodeerr
elif self.path and self.manager.fscache.isdir(self.path):
elif self.path and manager.fscache.isdir(self.path):
source = ""
self.source_hash = ""
else:
Expand All @@ -3192,7 +3186,7 @@ def get_source(self) -> str:
self.time_spent_us += time_spent_us(t0)
return source

def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None:
def parse_file_inner(self, source: str | None, raw_data: FileRawData | None = None) -> None:
t0 = time_ref()
self.tree = self.manager.parse_file(
self.id, self.xpath, source, options=self.options, raw_data=raw_data
Expand Down Expand Up @@ -3319,9 +3313,7 @@ def semantic_analysis_pass1(self) -> None:
#
# TODO: This should not be considered as a semantic analysis
# pass -- it's an independent pass.
if not options.native_parser or not self.manager.fscache.exists(
self.xpath, real_only=True
):
if not options.native_parser:
analyzer = SemanticAnalyzerPreAnalysis()
with self.wrap_context():
analyzer.visit_file(self.tree, self.xpath, self.id, options)
Expand Down
1 change: 0 additions & 1 deletion mypy/checkstrformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,6 @@ def apply_field_accessors(
module=None,
options=self.chk.options,
errors=temp_errors,
file_exists=False,
eager=True,
)
if temp_errors.is_errors():
Expand Down
5 changes: 1 addition & 4 deletions mypy/fscache.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,13 +253,10 @@ def isdir(self, path: str) -> bool:
return False
return stat.S_ISDIR(st.st_mode)

def exists(self, path: str, real_only: bool = False) -> bool:
def exists(self, path: str) -> bool:
st = self.stat_or_none(path)
if st is None:
return False
if real_only:
dirname = os.path.dirname(path)
return dirname not in self.fake_package_cache
return True

def read(self, path: str) -> bytes:
Expand Down
13 changes: 10 additions & 3 deletions mypy/nativeparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,10 @@ def add_error(


def native_parse(
filename: str, options: Options, skip_function_bodies: bool = False
filename: str,
options: Options,
source: str | bytes | None = None,
skip_function_bodies: bool = False,
) -> tuple[MypyFile, list[ParseError], TypeIgnores]:
"""Parse a Python file using the native Rust-based parser.

Expand Down Expand Up @@ -211,7 +214,7 @@ def native_parse(
uses_template_strings,
source_hash,
mypy_comments,
) = parse_to_binary_ast(filename, options, skip_function_bodies)
) = parse_to_binary_ast(filename, options, source, skip_function_bodies)
node = MypyFile([], [])
node.path = filename
node.raw_data = FileRawData(
Expand Down Expand Up @@ -248,7 +251,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:


def parse_to_binary_ast(
filename: str, options: Options, skip_function_bodies: bool = False
filename: str,
options: Options,
source: str | bytes | None = None,
skip_function_bodies: bool = False,
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
# This is a horrible hack to work around a mypyc bug where imported
# module may be not ready in a thread sometimes.
Expand All @@ -259,6 +265,7 @@ def parse_to_binary_ast(
raise ImportError("Cannot import ast_serialize")
ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse(
filename,
source,
skip_function_bodies=skip_function_bodies,
python_version=options.python_version,
platform=options.platform,
Expand Down
44 changes: 22 additions & 22 deletions mypy/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@


def parse(
source: str | bytes,
source: str | bytes | None,
fnam: str,
module: str | None,
errors: Errors,
options: Options,
file_exists: bool,
eager: bool = False,
) -> MypyFile:
"""Parse a source file, without doing any semantic analysis.
Expand All @@ -27,28 +26,29 @@ def parse(

New parser returns empty tree with serialized data. To get the full tree and
the parse errors, use eager=True.

`source` must not be `None` if the old parser is used. The new parser will read and
parse contents from path `fnam` if `source` is `None`.
"""
if options.native_parser:
# Native parser only works with actual files on disk
# Fall back to fastparse for in-memory source or non-existent files
if file_exists:
import mypy.nativeparse

ignore_errors = options.ignore_errors or fnam in errors.ignored_files
# If errors are ignored, we can drop many function bodies to speed up type checking.
strip_function_bodies = ignore_errors and not options.preserve_asts
tree, _, _ = mypy.nativeparse.native_parse(
fnam, options, skip_function_bodies=strip_function_bodies
)
# Set is_stub based on file extension
tree.is_stub = fnam.endswith(".pyi")
# Note: tree.imports is populated directly by load_from_raw() with deserialized
# import metadata, so we don't need to collect imports via AST traversal
if eager and tree.raw_data is not None:
tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
return tree
# Fall through to fastparse for non-existent files

import mypy.nativeparse

ignore_errors = options.ignore_errors or fnam in errors.ignored_files
# If errors are ignored, we can drop many function bodies to speed up type checking.
strip_function_bodies = ignore_errors and not options.preserve_asts
tree, _, _ = mypy.nativeparse.native_parse(
fnam, options, source, skip_function_bodies=strip_function_bodies
)
# Set is_stub based on file extension
tree.is_stub = fnam.endswith(".pyi")
# Note: tree.imports is populated directly by load_from_raw() with deserialized
# import metadata, so we don't need to collect imports via AST traversal
if eager and tree.raw_data is not None:
tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
return tree

if source is None:
raise ValueError("Source cannot be `None` when using the old parser")
if options.transform_source is not None:
source = options.transform_source(source)
import mypy.fastparse
Expand Down
8 changes: 1 addition & 7 deletions mypy/stubgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1745,13 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None:
source = mypy.util.decode_python_encoding(data)
errors = Errors(mypy_options)
mod.ast = mypy.parse.parse(
source,
fnam=mod.path,
module=mod.module,
errors=errors,
options=mypy_options,
file_exists=True,
eager=True,
source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True
)
mod.ast._fullname = mod.module
if errors.is_blockers():
Expand Down
Loading
Loading