From 2e793ae79457065d40c9a438bdb42a875dc71a82 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Fri, 17 Apr 2026 20:58:48 +1200 Subject: [PATCH 01/21] Enable native parsing to use source directly --- mypy/nativeparse.py | 13 ++++++++++--- mypy/test/test_nativeparse.py | 24 +++++++++++++++++++----- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/mypy/nativeparse.py b/mypy/nativeparse.py index b49ab0cfaef50..5378b0b0329a0 100644 --- a/mypy/nativeparse.py +++ b/mypy/nativeparse.py @@ -189,7 +189,10 @@ def add_error( def native_parse( - filename: str, options: Options, skip_function_bodies: bool = False + filename: str, + options: Options, + source: str | bytes | None = None, + skip_function_bodies: bool = False, ) -> tuple[MypyFile, list[ParseError], TypeIgnores]: """Parse a Python file using the native Rust-based parser. @@ -218,7 +221,7 @@ def native_parse( uses_template_strings, source_hash, mypy_comments, - ) = parse_to_binary_ast(filename, options, skip_function_bodies) + ) = parse_to_binary_ast(filename, options, source, skip_function_bodies) node = MypyFile([], []) node.path = filename node.raw_data = FileRawData( @@ -255,7 +258,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]: def parse_to_binary_ast( - filename: str, options: Options, skip_function_bodies: bool = False + filename: str, + options: Options, + source: str | bytes | None = None, + skip_function_bodies: bool = False, ) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]: # This is a horrible hack to work around a mypyc bug where imported # module may be not ready in a thread sometimes. @@ -266,6 +272,7 @@ def parse_to_binary_ast( raise ImportError("Cannot import ast_serialize") ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse( filename, + source, skip_function_bodies=skip_function_bodies, python_version=options.python_version, platform=options.platform, diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index b50da5f5d02c7..74983c9e9b31f 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -234,7 +234,7 @@ def format_reachable_imports(node: MypyFile) -> list[str]: @unittest.skipUnless(has_nativeparse, "nativeparse not available") class TestNativeParserBinaryFormat(unittest.TestCase): - def test_trivial_binary_data(self) -> None: + def _assert_trivial_binary_data(self, b: bytes, /) -> None: # A quick sanity check to ensure the serialized data looks as expected. Only covers # a few AST nodes. @@ -250,9 +250,9 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> int_enc(end_column - start_column), ] - with temp_source("print('hello')") as fnam: - b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) - assert list(b) == ( + self.assertEqual( + list(b), + ( [LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR] + [nodes.NAME_EXPR, LITERAL_STR] + [int_enc(5)] @@ -269,7 +269,21 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> + [LIST_GEN, 22, LITERAL_NONE] + locs(1, 0, 1, 14) + [END_TAG, END_TAG] - ) + ), + ) + + def test_trivial_binary_data_from_file(self) -> None: + with temp_source("print('hello')") as fnam: + b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) + self._assert_trivial_binary_data(b) + + def test_trivial_binary_data_from_string_source(self) -> None: + b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), "print('hello')") + self._assert_trivial_binary_data(b) + + def test_trivial_binary_data_from_bytes_source(self) -> None: + b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), b"print('hello')") + self._assert_trivial_binary_data(b) @contextlib.contextmanager From 50b086026529eddc7111126140b5fa46feca5ce8 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Fri, 17 Apr 2026 21:02:35 +1200 Subject: [PATCH 02/21] Remove `file_exists` parameter from `mypy.parse.parse()` calls --- misc/dump-ast.py | 2 +- mypy/build.py | 3 +-- mypy/checkstrformat.py | 1 - mypy/parse.py | 35 +++++++++++++++-------------------- mypy/stubgen.py | 1 - mypy/test/testparse.py | 2 -- 6 files changed, 17 insertions(+), 27 deletions(-) diff --git a/misc/dump-ast.py b/misc/dump-ast.py index 68ea8bc0dc61e..7fdf905bae0b4 100755 --- a/misc/dump-ast.py +++ b/misc/dump-ast.py @@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No options.python_version = python_version with open(fname, "rb") as f: s = f.read() - tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True) + tree = parse(s, fname, None, errors=Errors(options), options=options) if not quiet: print(tree) diff --git a/mypy/build.py b/mypy/build.py index 5c1c6c62bffde..1b553b9ad3529 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1282,13 +1282,12 @@ def parse_file( Raise CompileError if there is a parse error. """ - file_exists = self.fscache.exists(path, real_only=True) t0 = time.time() if raw_data: # If possible, deserialize from known binary data instead of parsing from scratch. tree = load_from_raw(path, id, raw_data, self.errors, options) else: - tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists) + tree = parse(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: with self.stats_lock: diff --git a/mypy/checkstrformat.py b/mypy/checkstrformat.py index e96af007e29c9..aba49d71b77ec 100644 --- a/mypy/checkstrformat.py +++ b/mypy/checkstrformat.py @@ -587,7 +587,6 @@ def apply_field_accessors( module=None, options=self.chk.options, errors=temp_errors, - file_exists=False, eager=True, ) if temp_errors.is_errors(): diff --git a/mypy/parse.py b/mypy/parse.py index b0901a3a24552..3ca757ed2522c 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -17,7 +17,6 @@ def parse( module: str | None, errors: Errors, options: Options, - file_exists: bool, eager: bool = False, ) -> MypyFile: """Parse a source file, without doing any semantic analysis. @@ -29,25 +28,21 @@ def parse( the parse errors, use eager=True. """ if options.native_parser: - # Native parser only works with actual files on disk - # Fall back to fastparse for in-memory source or non-existent files - if file_exists: - import mypy.nativeparse - - ignore_errors = options.ignore_errors or fnam in errors.ignored_files - # If errors are ignored, we can drop many function bodies to speed up type checking. - strip_function_bodies = ignore_errors and not options.preserve_asts - tree, _, _ = mypy.nativeparse.native_parse( - fnam, options, skip_function_bodies=strip_function_bodies - ) - # Set is_stub based on file extension - tree.is_stub = fnam.endswith(".pyi") - # Note: tree.imports is populated directly by load_from_raw() with deserialized - # import metadata, so we don't need to collect imports via AST traversal - if eager and tree.raw_data is not None: - tree = load_from_raw(fnam, module, tree.raw_data, errors, options) - return tree - # Fall through to fastparse for non-existent files + import mypy.nativeparse + + ignore_errors = options.ignore_errors or fnam in errors.ignored_files + # If errors are ignored, we can drop many function bodies to speed up type checking. + strip_function_bodies = ignore_errors and not options.preserve_asts + tree, _, _ = mypy.nativeparse.native_parse( + fnam, options, source, skip_function_bodies=strip_function_bodies + ) + # Set is_stub based on file extension + tree.is_stub = fnam.endswith(".pyi") + # Note: tree.imports is populated directly by load_from_raw() with deserialized + # import metadata, so we don't need to collect imports via AST traversal + if eager and tree.raw_data is not None: + tree = load_from_raw(fnam, module, tree.raw_data, errors, options) + return tree if options.transform_source is not None: source = options.transform_source(source) diff --git a/mypy/stubgen.py b/mypy/stubgen.py index 9c682ba4b8201..267a694da92b4 100755 --- a/mypy/stubgen.py +++ b/mypy/stubgen.py @@ -1750,7 +1750,6 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None: module=mod.module, errors=errors, options=mypy_options, - file_exists=True, eager=True, ) mod.ast._fullname = mod.module diff --git a/mypy/test/testparse.py b/mypy/test/testparse.py index 6d00f5b5710f9..8f4de5bc7412b 100644 --- a/mypy/test/testparse.py +++ b/mypy/test/testparse.py @@ -66,7 +66,6 @@ def test_parser(testcase: DataDrivenTestCase) -> None: module="__main__", errors=errors, options=options, - file_exists=False, eager=True, ) if errors.is_errors(): @@ -108,7 +107,6 @@ def test_parse_error(testcase: DataDrivenTestCase) -> None: "__main__", errors=errors, options=options, - file_exists=False, eager=True, ) if errors.is_errors(): From ac275e47c940e94434f0e34d20febf808ec9c989 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 28 Apr 2026 12:57:20 +1200 Subject: [PATCH 03/21] Test for invalid bytes --- mypy/test/test_nativeparse.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index 74983c9e9b31f..b2a9011325395 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -285,6 +285,10 @@ def test_trivial_binary_data_from_bytes_source(self) -> None: b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), b"print('hello')") self._assert_trivial_binary_data(b) + def test_invalid_bytes_raises(self) -> None: + with self.assertRaises(UnicodeDecodeError): + parse_to_binary_ast("", Options(), b"\xff") + @contextlib.contextmanager def temp_source(text: str) -> Iterator[str]: From 149e459964a8f9d792178cbb36121ef40009c6fa Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 28 Apr 2026 15:06:34 +1200 Subject: [PATCH 04/21] Fix omitted argument --- mypy/test/test_nativeparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index b2a9011325395..e0a0da29166b9 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -98,7 +98,7 @@ def test_parser(testcase: DataDrivenTestCase) -> None: try: with temp_source(source) as fnam: - node, errors, type_ignores = native_parse(fnam, options, skip_function_bodies) + node, errors, type_ignores = native_parse(fnam, options, None, skip_function_bodies) errors += load_tree(node, options) node.path = "main" a = node.str_with_options(options).split("\n") From 47e45f71906e8c644c8b010a836a6609c6ead83d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Apr 2026 03:08:58 +0000 Subject: [PATCH 05/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mypy/stubgen.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mypy/stubgen.py b/mypy/stubgen.py index 267a694da92b4..9b0089b6aec0f 100755 --- a/mypy/stubgen.py +++ b/mypy/stubgen.py @@ -1745,12 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None: source = mypy.util.decode_python_encoding(data) errors = Errors(mypy_options) mod.ast = mypy.parse.parse( - source, - fnam=mod.path, - module=mod.module, - errors=errors, - options=mypy_options, - eager=True, + source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True ) mod.ast._fullname = mod.module if errors.is_blockers(): From 99c7610e9c0affef7bbcd1729b9b4655ab0419ab Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 14:49:03 +1200 Subject: [PATCH 06/21] Remove fscache existence checks and parallel workarounds --- mypy/build.py | 8 +------- mypy/fscache.py | 5 +---- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 80613f07ba8f0..7c19d5cbc57eb 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1030,10 +1030,6 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: if state.tree is not None: # The file was already parsed. continue - if not self.fscache.exists(state.xpath, real_only=True): - # New parser only supports parsing on-disk files. - sequential_states.append(state) - continue parallel_states.append(state) if len(parallel_states) > 1: self.parse_parallel(sequential_states, parallel_states) @@ -3318,9 +3314,7 @@ def semantic_analysis_pass1(self) -> None: # # TODO: This should not be considered as a semantic analysis # pass -- it's an independent pass. - if not options.native_parser or not self.manager.fscache.exists( - self.xpath, real_only=True - ): + if not options.native_parser: analyzer = SemanticAnalyzerPreAnalysis() with self.wrap_context(): analyzer.visit_file(self.tree, self.xpath, self.id, options) diff --git a/mypy/fscache.py b/mypy/fscache.py index 75041633eb90b..63fe5368a2a9c 100644 --- a/mypy/fscache.py +++ b/mypy/fscache.py @@ -253,13 +253,10 @@ def isdir(self, path: str) -> bool: return False return stat.S_ISDIR(st.st_mode) - def exists(self, path: str, real_only: bool = False) -> bool: + def exists(self, path: str) -> bool: st = self.stat_or_none(path) if st is None: return False - if real_only: - dirname = os.path.dirname(path) - return dirname not in self.fake_package_cache return True def read(self, path: str) -> bytes: From 5c45fe2f056fb594b093dae2dacd4eca22ce7c2a Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 14:51:03 +1200 Subject: [PATCH 07/21] Remove handling of sequential states --- mypy/build.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 7c19d5cbc57eb..6f5085d388227 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1024,23 +1024,19 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: self.post_parse_all(states) return - sequential_states = [] - parallel_states = [] + parallel_states = [state for state in states if state.tree is None] for state in states: if state.tree is not None: # The file was already parsed. + state.needs_parse = False continue parallel_states.append(state) if len(parallel_states) > 1: - self.parse_parallel(sequential_states, parallel_states) - else: - # Avoid using executor when there is no parallelism. - for state in states: - state.parse_file() + self.parse_parallel(parallel_states) if post_parse: self.post_parse_all(states) - def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None: + def parse_parallel(self, parallel_states: list[State]) -> None: """Perform parallel parsing of states. Note: this duplicates a bit of logic from State.parse_file(). This is done @@ -1048,7 +1044,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S parallelized efficiently. """ parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( - sequential_states, parallel_states + parallel_states ) for state in parallel_parsed_states: @@ -1093,12 +1089,9 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S state.check_blockers() state.setup_errors() - def parse_files_threaded_raw( - self, sequential_states: list[State], parallel_states: list[State] - ) -> tuple[list[State], set[State]]: - """Parse files using a thread pool. + def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]: + """Parse files in parallel using a thread pool. - Also parse sequential states while waiting for the parallel results. Trees from the new parser are left in raw (serialized) form. Return (list, set) of states that were actually parsed (not cached). @@ -1114,7 +1107,7 @@ def parse_files_threaded_raw( # parse_file_inner() results in no visible improvement with more than 8 threads. # TODO: reuse thread pool and/or batch small files in single submit() call. with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor: - for state in parallel_states: + for state in states: state.needs_parse = False if state.id not in self.ast_cache: self.log(f"Parsing {state.xpath} ({state.id})") @@ -1129,10 +1122,6 @@ def parse_files_threaded_raw( state.tree, state.early_errors, source_hash = self.ast_cache[state.id] state.source_hash = source_hash - # Parse sequential before waiting on parallel. - for state in sequential_states: - state.parse_file() - for fut in wait(futures).done: fut.result() From 8e5319180f344ae4e55c8dd77a33fd70e1440002 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 15:38:58 +1200 Subject: [PATCH 08/21] Don't pass dummy source --- mypy/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy/build.py b/mypy/build.py index 6f5085d388227..f7c56f58c600a 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1114,7 +1114,7 @@ def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], se ignore_errors = state.ignore_all or state.options.ignore_errors if ignore_errors: self.errors.ignored_files.add(state.xpath) - futures.append(executor.submit(state.parse_file_inner, "")) + futures.append(executor.submit(state.parse_file_inner, state.source)) parallel_parsed_states.append(state) parallel_parsed_states_set.add(state) else: From b029c4420837d52af4a8b78e006758cfe4a30df2 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 15:41:48 +1200 Subject: [PATCH 09/21] Refactor to allow a safer `source=None` --- mypy/build.py | 21 +++++++++++++++------ mypy/parse.py | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index f7c56f58c600a..eeb5a2fd3df63 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -163,7 +163,7 @@ from mypy.modules_state import modules_state from mypy.nodes import Expression from mypy.options import Options -from mypy.parse import load_from_raw, parse +from mypy.parse import load_from_raw, parse, parse_native from mypy.plugin import ChainedPlugin, Plugin, ReportConfigContext from mypy.plugins.default import DefaultPlugin from mypy.renaming import LimitedVariableRenameVisitor, VariableRenameVisitor @@ -1114,7 +1114,9 @@ def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], se ignore_errors = state.ignore_all or state.options.ignore_errors if ignore_errors: self.errors.ignored_files.add(state.xpath) - futures.append(executor.submit(state.parse_file_inner, state.source)) + futures.append( + executor.submit(state.parse_file_inner, state.source, parallel=True) + ) parallel_parsed_states.append(state) parallel_parsed_states_set.add(state) else: @@ -1264,9 +1266,10 @@ def parse_file( self, id: str, path: str, - source: str, + source: str | None, options: Options, raw_data: FileRawData | None = None, + parallel: bool = False, ) -> MypyFile: """Parse the source of a file with the given name. @@ -1277,7 +1280,11 @@ def parse_file( # If possible, deserialize from known binary data instead of parsing from scratch. tree = load_from_raw(path, id, raw_data, self.errors, options) else: - tree = parse(source, path, id, self.errors, options=options) + if source is not None: + tree = parse(source, path, id, self.errors, options=options) + else: + assert parallel + tree = parse_native(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: with self.stats_lock: @@ -3176,10 +3183,12 @@ def get_source(self) -> str: self.time_spent_us += time_spent_us(t0) return source - def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None: + def parse_file_inner( + self, source: str | None, raw_data: FileRawData | None = None, parallel: bool = False + ) -> None: t0 = time_ref() self.tree = self.manager.parse_file( - self.id, self.xpath, source, options=self.options, raw_data=raw_data + self.id, self.xpath, source, self.options, raw_data, parallel ) self.time_spent_us += time_spent_us(t0) diff --git a/mypy/parse.py b/mypy/parse.py index 3ca757ed2522c..515dd57257b9e 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -28,21 +28,7 @@ def parse( the parse errors, use eager=True. """ if options.native_parser: - import mypy.nativeparse - - ignore_errors = options.ignore_errors or fnam in errors.ignored_files - # If errors are ignored, we can drop many function bodies to speed up type checking. - strip_function_bodies = ignore_errors and not options.preserve_asts - tree, _, _ = mypy.nativeparse.native_parse( - fnam, options, source, skip_function_bodies=strip_function_bodies - ) - # Set is_stub based on file extension - tree.is_stub = fnam.endswith(".pyi") - # Note: tree.imports is populated directly by load_from_raw() with deserialized - # import metadata, so we don't need to collect imports via AST traversal - if eager and tree.raw_data is not None: - tree = load_from_raw(fnam, module, tree.raw_data, errors, options) - return tree + return parse_native(source, fnam, module, errors, options, eager) if options.transform_source is not None: source = options.transform_source(source) @@ -97,6 +83,31 @@ def load_from_raw( return tree +def parse_native( + source: str | bytes | None, + fnam: str, + module: str | None, + errors: Errors, + options: Options, + eager: bool = False, +) -> MypyFile: + import mypy.nativeparse + + ignore_errors = options.ignore_errors or fnam in errors.ignored_files + # If errors are ignored, we can drop many function bodies to speed up type checking. + strip_function_bodies = ignore_errors and not options.preserve_asts + tree, _, _ = mypy.nativeparse.native_parse( + fnam, options, source, skip_function_bodies=strip_function_bodies + ) + # Set is_stub based on file extension + tree.is_stub = fnam.endswith(".pyi") + # Note: tree.imports is populated directly by load_from_raw() with deserialized + # import metadata, so we don't need to collect imports via AST traversal + if eager and tree.raw_data is not None: + tree = load_from_raw(fnam, module, tree.raw_data, errors, options) + return tree + + def report_parse_error(error: ParseError, errors: Errors) -> None: message = error["message"] # Standardize error message by capitalizing the first word From d1691ccde9c233f51d33114edd068748c8fe90aa Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 18:32:31 +1200 Subject: [PATCH 10/21] Raise OSError in parallel mode for missing files --- mypy/build.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mypy/build.py b/mypy/build.py index eeb5a2fd3df63..68d42ab612bfe 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1284,6 +1284,13 @@ def parse_file( tree = parse(source, path, id, self.errors, options=options) else: assert parallel + if (source is None) and (not os.path.exists(path)): + build_error( + "Cannot read file '{}': {}".format( + path.replace(os.getcwd() + os.sep, ""), + os.strerror(2), # `errno.ENOENT` + ) + ) tree = parse_native(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: From 2a523b5de21ab5232116aa4f25741eeaa5e50edf Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 19:40:50 +1200 Subject: [PATCH 11/21] Temporarily fix test --- test-data/unit/cmdline.test | 1 + 1 file changed, 1 insertion(+) diff --git a/test-data/unit/cmdline.test b/test-data/unit/cmdline.test index cfba7a81e9285..04719ea81eac9 100644 --- a/test-data/unit/cmdline.test +++ b/test-data/unit/cmdline.test @@ -593,6 +593,7 @@ import d [case testPackageRootMultipleParallel] # cmd: mypy --package-root=a/ --package-root=./ a/b/c.py d.py main.py --num-workers=2 +[file a/b/__init__.py] [file a/b/c.py] [file d.py] [file main.py] From 6d72fffd89289e61024aa83aeed3dc1780cb1718 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 19:43:45 +1200 Subject: [PATCH 12/21] Remove redundant condition --- mypy/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy/build.py b/mypy/build.py index 68d42ab612bfe..0501399a0c59c 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1284,7 +1284,7 @@ def parse_file( tree = parse(source, path, id, self.errors, options=options) else: assert parallel - if (source is None) and (not os.path.exists(path)): + if not os.path.exists(path): build_error( "Cannot read file '{}': {}".format( path.replace(os.getcwd() + os.sep, ""), From 1a28227b86cfe8534d1717e9a8015f95f2cacf6c Mon Sep 17 00:00:00 2001 From: bzoracler Date: Mon, 18 May 2026 20:24:45 +1200 Subject: [PATCH 13/21] Don't create duplicate parallel states --- mypy/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy/build.py b/mypy/build.py index 0501399a0c59c..be2ab5d9d709c 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1024,7 +1024,7 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: self.post_parse_all(states) return - parallel_states = [state for state in states if state.tree is None] + parallel_states = [] for state in states: if state.tree is not None: # The file was already parsed. From cb1bcae4bb54a2666a7386a3556a68779628038b Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 07:48:58 +1200 Subject: [PATCH 14/21] Fix missing parse call --- mypy/build.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mypy/build.py b/mypy/build.py index be2ab5d9d709c..e9062bdebe98c 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1033,6 +1033,9 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: parallel_states.append(state) if len(parallel_states) > 1: self.parse_parallel(parallel_states) + elif len(parallel_states) == 1: + # Avoid using executor when there is no parallelism. + parallel_states[0].parse_file() if post_parse: self.post_parse_all(states) From df34d0b146b2b6806bccb1e264e959de3ba47e0d Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 11:39:20 +1200 Subject: [PATCH 15/21] Revert "Temporarily fix test" This reverts commit 2a523b5de21ab5232116aa4f25741eeaa5e50edf. --- test-data/unit/cmdline.test | 1 - 1 file changed, 1 deletion(-) diff --git a/test-data/unit/cmdline.test b/test-data/unit/cmdline.test index 04719ea81eac9..cfba7a81e9285 100644 --- a/test-data/unit/cmdline.test +++ b/test-data/unit/cmdline.test @@ -593,7 +593,6 @@ import d [case testPackageRootMultipleParallel] # cmd: mypy --package-root=a/ --package-root=./ a/b/c.py d.py main.py --num-workers=2 -[file a/b/__init__.py] [file a/b/c.py] [file d.py] [file main.py] From 49c65b83b3c08f04f2492cee8966588b9cbbf959 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 11:41:59 +1200 Subject: [PATCH 16/21] Revert "Refactor to allow a safer `source=None`" --- mypy/build.py | 26 +++++--------------------- mypy/parse.py | 41 +++++++++++++++-------------------------- 2 files changed, 20 insertions(+), 47 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index e9062bdebe98c..dd96fbc7b872b 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -163,7 +163,7 @@ from mypy.modules_state import modules_state from mypy.nodes import Expression from mypy.options import Options -from mypy.parse import load_from_raw, parse, parse_native +from mypy.parse import load_from_raw, parse from mypy.plugin import ChainedPlugin, Plugin, ReportConfigContext from mypy.plugins.default import DefaultPlugin from mypy.renaming import LimitedVariableRenameVisitor, VariableRenameVisitor @@ -1117,9 +1117,7 @@ def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], se ignore_errors = state.ignore_all or state.options.ignore_errors if ignore_errors: self.errors.ignored_files.add(state.xpath) - futures.append( - executor.submit(state.parse_file_inner, state.source, parallel=True) - ) + futures.append(executor.submit(state.parse_file_inner, state.source)) parallel_parsed_states.append(state) parallel_parsed_states_set.add(state) else: @@ -1272,7 +1270,6 @@ def parse_file( source: str | None, options: Options, raw_data: FileRawData | None = None, - parallel: bool = False, ) -> MypyFile: """Parse the source of a file with the given name. @@ -1283,18 +1280,7 @@ def parse_file( # If possible, deserialize from known binary data instead of parsing from scratch. tree = load_from_raw(path, id, raw_data, self.errors, options) else: - if source is not None: - tree = parse(source, path, id, self.errors, options=options) - else: - assert parallel - if not os.path.exists(path): - build_error( - "Cannot read file '{}': {}".format( - path.replace(os.getcwd() + os.sep, ""), - os.strerror(2), # `errno.ENOENT` - ) - ) - tree = parse_native(source, path, id, self.errors, options=options) + tree = parse(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: with self.stats_lock: @@ -3193,12 +3179,10 @@ def get_source(self) -> str: self.time_spent_us += time_spent_us(t0) return source - def parse_file_inner( - self, source: str | None, raw_data: FileRawData | None = None, parallel: bool = False - ) -> None: + def parse_file_inner(self, source: str | None, raw_data: FileRawData | None = None) -> None: t0 = time_ref() self.tree = self.manager.parse_file( - self.id, self.xpath, source, self.options, raw_data, parallel + self.id, self.xpath, source, options=self.options, raw_data=raw_data ) self.time_spent_us += time_spent_us(t0) diff --git a/mypy/parse.py b/mypy/parse.py index 515dd57257b9e..3ca757ed2522c 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -28,7 +28,21 @@ def parse( the parse errors, use eager=True. """ if options.native_parser: - return parse_native(source, fnam, module, errors, options, eager) + import mypy.nativeparse + + ignore_errors = options.ignore_errors or fnam in errors.ignored_files + # If errors are ignored, we can drop many function bodies to speed up type checking. + strip_function_bodies = ignore_errors and not options.preserve_asts + tree, _, _ = mypy.nativeparse.native_parse( + fnam, options, source, skip_function_bodies=strip_function_bodies + ) + # Set is_stub based on file extension + tree.is_stub = fnam.endswith(".pyi") + # Note: tree.imports is populated directly by load_from_raw() with deserialized + # import metadata, so we don't need to collect imports via AST traversal + if eager and tree.raw_data is not None: + tree = load_from_raw(fnam, module, tree.raw_data, errors, options) + return tree if options.transform_source is not None: source = options.transform_source(source) @@ -83,31 +97,6 @@ def load_from_raw( return tree -def parse_native( - source: str | bytes | None, - fnam: str, - module: str | None, - errors: Errors, - options: Options, - eager: bool = False, -) -> MypyFile: - import mypy.nativeparse - - ignore_errors = options.ignore_errors or fnam in errors.ignored_files - # If errors are ignored, we can drop many function bodies to speed up type checking. - strip_function_bodies = ignore_errors and not options.preserve_asts - tree, _, _ = mypy.nativeparse.native_parse( - fnam, options, source, skip_function_bodies=strip_function_bodies - ) - # Set is_stub based on file extension - tree.is_stub = fnam.endswith(".pyi") - # Note: tree.imports is populated directly by load_from_raw() with deserialized - # import metadata, so we don't need to collect imports via AST traversal - if eager and tree.raw_data is not None: - tree = load_from_raw(fnam, module, tree.raw_data, errors, options) - return tree - - def report_parse_error(error: ParseError, errors: Errors) -> None: message = error["message"] # Standardize error message by capitalizing the first word From f47a8988474cedbd43e8a2e48e5c81b9bb03ba34 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 12:42:56 +1200 Subject: [PATCH 17/21] Handle `source=None` in `parse()` function --- mypy/parse.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mypy/parse.py b/mypy/parse.py index 3ca757ed2522c..a8fb5542a7049 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -12,7 +12,7 @@ def parse( - source: str | bytes, + source: str | bytes | None, fnam: str, module: str | None, errors: Errors, @@ -26,6 +26,9 @@ def parse( New parser returns empty tree with serialized data. To get the full tree and the parse errors, use eager=True. + + `source` must not be `None` if the old parser is used. The new parser will read and + parse contents from path `fnam` if `source` is `None`. """ if options.native_parser: import mypy.nativeparse @@ -44,6 +47,8 @@ def parse( tree = load_from_raw(fnam, module, tree.raw_data, errors, options) return tree + if source is None: + raise ValueError("Source cannot be `None` when using the old parser") if options.transform_source is not None: source = options.transform_source(source) import mypy.fastparse From 8cfffa008486bda96233f02121f49e60c0a04069 Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 12:44:12 +1200 Subject: [PATCH 18/21] Check for file existence before parallel parsing --- mypy/build.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mypy/build.py b/mypy/build.py index dd96fbc7b872b..871541137aa9f 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1026,6 +1026,13 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: parallel_states = [] for state in states: + if not self.fscache.exists(state.xpath): + build_error( + "Cannot read file '{}': {}".format( + state.xpath.replace(os.getcwd() + os.sep, ""), + os.strerror(2), # `errno.ENOENT` + ) + ) if state.tree is not None: # The file was already parsed. state.needs_parse = False From b9cc0b5ea1151dac3063978ab38c6e0dce36325c Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 14:04:25 +1200 Subject: [PATCH 19/21] Handle `source=None` when `--package-root` is set --- mypy/build.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 871541137aa9f..88e5c946752e2 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -2729,7 +2729,11 @@ def new_state( meta, meta_ex = meta_pair interface_hash = meta.interface_hash meta_source_hash = meta.hash - if path and source is None and manager.fscache.isdir(path): + if ( + path + and source is None + and (manager.fscache.isdir(path) or manager.fscache.init_under_package_root(path)) + ): source = "" if manager.stats_enabled: @@ -3173,7 +3177,10 @@ def get_source(self) -> str: else: err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}" raise CompileError([err], module_with_blocker=self.id) from decodeerr - elif self.path and self.manager.fscache.isdir(self.path): + elif self.path and ( + manager.fscache.isdir(self.path) + or manager.fscache.init_under_package_root(self.path) + ): source = "" self.source_hash = "" else: From 20b035ef44aed365ac4950718e5c8148805584cc Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 16:35:33 +1200 Subject: [PATCH 20/21] Simplify handling of `--package-root` --- mypy/build.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 88e5c946752e2..1f5c9f1049429 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1287,6 +1287,13 @@ def parse_file( # If possible, deserialize from known binary data instead of parsing from scratch. tree = load_from_raw(path, id, raw_data, self.errors, options) else: + # Handle fake `__init__.py` files due to `--package-root` + if ( + (source is None) + and (os.path.dirname(path) in self.fscache.fake_package_cache) + and (os.path.basename(path) == "__init__.py") + ): + source = "" tree = parse(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: @@ -2729,11 +2736,7 @@ def new_state( meta, meta_ex = meta_pair interface_hash = meta.interface_hash meta_source_hash = meta.hash - if ( - path - and source is None - and (manager.fscache.isdir(path) or manager.fscache.init_under_package_root(path)) - ): + if path and source is None and manager.fscache.isdir(path): source = "" if manager.stats_enabled: @@ -3177,10 +3180,7 @@ def get_source(self) -> str: else: err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}" raise CompileError([err], module_with_blocker=self.id) from decodeerr - elif self.path and ( - manager.fscache.isdir(self.path) - or manager.fscache.init_under_package_root(self.path) - ): + elif self.path and manager.fscache.isdir(self.path): source = "" self.source_hash = "" else: From 52422db9178514373a15b974f360da00cf6267bd Mon Sep 17 00:00:00 2001 From: bzoracler Date: Tue, 19 May 2026 16:51:02 +1200 Subject: [PATCH 21/21] Inline parallel parsing --- mypy/build.py | 101 +++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 1f5c9f1049429..8bea6f645a14f 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1038,66 +1038,59 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: state.needs_parse = False continue parallel_states.append(state) + if len(parallel_states) > 1: - self.parse_parallel(parallel_states) - elif len(parallel_states) == 1: - # Avoid using executor when there is no parallelism. - parallel_states[0].parse_file() - if post_parse: - self.post_parse_all(states) + # This duplicates a bit of logic from State.parse_file(). This is done to + # optimize handling of states parsed in parallel. - def parse_parallel(self, parallel_states: list[State]) -> None: - """Perform parallel parsing of states. + parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( + parallel_states + ) - Note: this duplicates a bit of logic from State.parse_file(). This is done - as an optimization to parallelize only those parts of the code that can be - parallelized efficiently. - """ - parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( - parallel_states - ) + for state in parallel_parsed_states: + # New parser only returns serialized ASTs + with state.wrap_context(): + assert state.tree is not None + raw_data = state.tree.raw_data + if raw_data is not None: + state.source_hash = raw_data.source_hash + state.apply_inline_configuration(raw_data.mypy_comments) + state.tree = load_from_raw( + state.xpath, + state.id, + raw_data, + self.errors, + state.options, + imports_only=bool(self.workers), + ) + if self.errors.is_blockers(): + self.log("Bailing due to parse errors") + self.errors.raise_error() - for state in parallel_parsed_states: - # New parser returns serialized ASTs. Deserialize full trees only if not using - # parallel workers. - with state.wrap_context(): + for state in parallel_states: assert state.tree is not None - raw_data = state.tree.raw_data - if raw_data is not None: - # Apply inline mypy config before deserialization, since - # some options (e.g. implicit_optional) affect deserialization - state.source_hash = raw_data.source_hash - state.apply_inline_configuration(raw_data.mypy_comments) - state.tree = load_from_raw( - state.xpath, - state.id, - raw_data, - self.errors, - state.options, - imports_only=bool(self.workers), - ) - if self.errors.is_blockers(): - self.log("Bailing due to parse errors") - self.errors.raise_error() - - for state in parallel_states: - assert state.tree is not None - if state in parallel_parsed_states_set: + if state in parallel_parsed_states_set: + if state.tree.raw_data is not None: + # source_hash was already extracted above, but raw_data + # may have been preserved for workers (imports_only=True). + pass + elif state.source_hash is None: + # At least namespace packages may not have source. + state.get_source() + state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) + state.semantic_analysis_pass1() + self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) + self.modules[state.id] = state.tree if state.tree.raw_data is not None: - # source_hash was already extracted above, but raw_data - # may have been preserved for workers (imports_only=True). - pass - elif state.source_hash is None: - # At least namespace packages may not have source. - state.get_source() - state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) - state.semantic_analysis_pass1() - self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) - self.modules[state.id] = state.tree - if state.tree.raw_data is not None: - state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT - state.check_blockers() - state.setup_errors() + state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT + state.check_blockers() + state.setup_errors() + elif len(parallel_states) == 1: + # Avoid using executor when there is no parallelism. + parallel_states[0].parse_file() + + if post_parse: + self.post_parse_all(states) def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]: """Parse files in parallel using a thread pool.