-
-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Allow nativeparse to parse source code directly #21260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
bzoracler
wants to merge
23
commits into
python:master
Choose a base branch
from
bzoracler:nativeparse-source
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
23 commits
Select commit
Hold shift + click to select a range
2e793ae
Enable native parsing to use source directly
bzoracler 50b0860
Remove `file_exists` parameter from `mypy.parse.parse()` calls
bzoracler ac275e4
Test for invalid bytes
bzoracler 149e459
Fix omitted argument
bzoracler 47e45f7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 96e8e07
Merge branch 'master' into nativeparse-source
bzoracler beba478
Merge branch 'master' into nativeparse-source
ilevkivskyi 99c7610
Remove fscache existence checks and parallel workarounds
bzoracler 5c45fe2
Remove handling of sequential states
bzoracler 8e53191
Don't pass dummy source
bzoracler b029c44
Refactor to allow a safer `source=None`
bzoracler d1691cc
Raise OSError in parallel mode for missing files
bzoracler 2a523b5
Temporarily fix test
bzoracler 6d72fff
Remove redundant condition
bzoracler 1a28227
Don't create duplicate parallel states
bzoracler cb1bcae
Fix missing parse call
bzoracler df34d0b
Revert "Temporarily fix test"
bzoracler 49c65b8
Revert "Refactor to allow a safer `source=None`"
bzoracler f47a898
Handle `source=None` in `parse()` function
bzoracler 8cfffa0
Check for file existence before parallel parsing
bzoracler b9cc0b5
Handle `source=None` when `--package-root` is set
bzoracler 20b035e
Simplify handling of `--package-root`
bzoracler 52422db
Inline parallel parsing
bzoracler File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1024,85 +1024,77 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: | |
| self.post_parse_all(states) | ||
| return | ||
|
|
||
| sequential_states = [] | ||
| parallel_states = [] | ||
| for state in states: | ||
| if not self.fscache.exists(state.xpath): | ||
| build_error( | ||
| "Cannot read file '{}': {}".format( | ||
| state.xpath.replace(os.getcwd() + os.sep, ""), | ||
| os.strerror(2), # `errno.ENOENT` | ||
| ) | ||
| ) | ||
| if state.tree is not None: | ||
| # The file was already parsed. | ||
| continue | ||
| if not self.fscache.exists(state.xpath, real_only=True): | ||
| # New parser only supports parsing on-disk files. | ||
| sequential_states.append(state) | ||
| state.needs_parse = False | ||
| continue | ||
| parallel_states.append(state) | ||
|
|
||
| if len(parallel_states) > 1: | ||
| self.parse_parallel(sequential_states, parallel_states) | ||
| else: | ||
| # Avoid using executor when there is no parallelism. | ||
| for state in states: | ||
| state.parse_file() | ||
| if post_parse: | ||
| self.post_parse_all(states) | ||
| # This duplicates a bit of logic from State.parse_file(). This is done to | ||
| # optimize handling of states parsed in parallel. | ||
|
|
||
| def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None: | ||
| """Perform parallel parsing of states. | ||
| parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( | ||
| parallel_states | ||
| ) | ||
|
|
||
| Note: this duplicates a bit of logic from State.parse_file(). This is done | ||
| as an optimization to parallelize only those parts of the code that can be | ||
| parallelized efficiently. | ||
| """ | ||
| parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( | ||
| sequential_states, parallel_states | ||
| ) | ||
| for state in parallel_parsed_states: | ||
| # New parser only returns serialized ASTs | ||
| with state.wrap_context(): | ||
| assert state.tree is not None | ||
| raw_data = state.tree.raw_data | ||
| if raw_data is not None: | ||
| state.source_hash = raw_data.source_hash | ||
| state.apply_inline_configuration(raw_data.mypy_comments) | ||
| state.tree = load_from_raw( | ||
| state.xpath, | ||
| state.id, | ||
| raw_data, | ||
| self.errors, | ||
| state.options, | ||
| imports_only=bool(self.workers), | ||
| ) | ||
| if self.errors.is_blockers(): | ||
| self.log("Bailing due to parse errors") | ||
| self.errors.raise_error() | ||
|
|
||
| for state in parallel_parsed_states: | ||
| # New parser returns serialized ASTs. Deserialize full trees only if not using | ||
| # parallel workers. | ||
| with state.wrap_context(): | ||
| for state in parallel_states: | ||
| assert state.tree is not None | ||
| raw_data = state.tree.raw_data | ||
| if raw_data is not None: | ||
| # Apply inline mypy config before deserialization, since | ||
| # some options (e.g. implicit_optional) affect deserialization | ||
| state.source_hash = raw_data.source_hash | ||
| state.apply_inline_configuration(raw_data.mypy_comments) | ||
| state.tree = load_from_raw( | ||
| state.xpath, | ||
| state.id, | ||
| raw_data, | ||
| self.errors, | ||
| state.options, | ||
| imports_only=bool(self.workers), | ||
| ) | ||
| if self.errors.is_blockers(): | ||
| self.log("Bailing due to parse errors") | ||
| self.errors.raise_error() | ||
|
|
||
| for state in parallel_states: | ||
| assert state.tree is not None | ||
| if state in parallel_parsed_states_set: | ||
| if state in parallel_parsed_states_set: | ||
| if state.tree.raw_data is not None: | ||
| # source_hash was already extracted above, but raw_data | ||
| # may have been preserved for workers (imports_only=True). | ||
| pass | ||
| elif state.source_hash is None: | ||
| # At least namespace packages may not have source. | ||
| state.get_source() | ||
| state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) | ||
| state.semantic_analysis_pass1() | ||
| self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) | ||
| self.modules[state.id] = state.tree | ||
| if state.tree.raw_data is not None: | ||
| # source_hash was already extracted above, but raw_data | ||
| # may have been preserved for workers (imports_only=True). | ||
| pass | ||
| elif state.source_hash is None: | ||
| # At least namespace packages may not have source. | ||
| state.get_source() | ||
| state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) | ||
| state.semantic_analysis_pass1() | ||
| self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) | ||
| self.modules[state.id] = state.tree | ||
| if state.tree.raw_data is not None: | ||
| state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT | ||
| state.check_blockers() | ||
| state.setup_errors() | ||
|
|
||
| def parse_files_threaded_raw( | ||
| self, sequential_states: list[State], parallel_states: list[State] | ||
| ) -> tuple[list[State], set[State]]: | ||
| """Parse files using a thread pool. | ||
|
|
||
| Also parse sequential states while waiting for the parallel results. | ||
| state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT | ||
| state.check_blockers() | ||
| state.setup_errors() | ||
| elif len(parallel_states) == 1: | ||
| # Avoid using executor when there is no parallelism. | ||
| parallel_states[0].parse_file() | ||
|
|
||
| if post_parse: | ||
| self.post_parse_all(states) | ||
|
|
||
| def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]: | ||
| """Parse files in parallel using a thread pool. | ||
|
|
||
| Trees from the new parser are left in raw (serialized) form. | ||
|
|
||
| Return (list, set) of states that were actually parsed (not cached). | ||
|
|
@@ -1118,25 +1110,21 @@ def parse_files_threaded_raw( | |
| # parse_file_inner() results in no visible improvement with more than 8 threads. | ||
| # TODO: reuse thread pool and/or batch small files in single submit() call. | ||
| with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor: | ||
| for state in parallel_states: | ||
| for state in states: | ||
| state.needs_parse = False | ||
| if state.id not in self.ast_cache: | ||
| self.log(f"Parsing {state.xpath} ({state.id})") | ||
| ignore_errors = state.ignore_all or state.options.ignore_errors | ||
| if ignore_errors: | ||
| self.errors.ignored_files.add(state.xpath) | ||
| futures.append(executor.submit(state.parse_file_inner, "")) | ||
| futures.append(executor.submit(state.parse_file_inner, state.source)) | ||
| parallel_parsed_states.append(state) | ||
| parallel_parsed_states_set.add(state) | ||
| else: | ||
| self.log(f"Using cached AST for {state.xpath} ({state.id})") | ||
| state.tree, state.early_errors, source_hash = self.ast_cache[state.id] | ||
| state.source_hash = source_hash | ||
|
|
||
| # Parse sequential before waiting on parallel. | ||
| for state in sequential_states: | ||
| state.parse_file() | ||
|
|
||
| for fut in wait(futures).done: | ||
| fut.result() | ||
|
|
||
|
|
@@ -1279,21 +1267,27 @@ def parse_file( | |
| self, | ||
| id: str, | ||
| path: str, | ||
| source: str, | ||
| source: str | None, | ||
| options: Options, | ||
| raw_data: FileRawData | None = None, | ||
| ) -> MypyFile: | ||
| """Parse the source of a file with the given name. | ||
|
|
||
| Raise CompileError if there is a parse error. | ||
| """ | ||
| file_exists = self.fscache.exists(path, real_only=True) | ||
|
bzoracler marked this conversation as resolved.
|
||
| t0 = time.time() | ||
| if raw_data: | ||
| # If possible, deserialize from known binary data instead of parsing from scratch. | ||
| tree = load_from_raw(path, id, raw_data, self.errors, options) | ||
| else: | ||
| tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists) | ||
| # Handle fake `__init__.py` files due to `--package-root` | ||
| if ( | ||
| (source is None) | ||
| and (os.path.dirname(path) in self.fscache.fake_package_cache) | ||
| and (os.path.basename(path) == "__init__.py") | ||
| ): | ||
| source = "" | ||
|
Comment on lines
+1283
to
+1289
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Substitutes previous handling of |
||
| tree = parse(source, path, id, self.errors, options=options) | ||
| tree._fullname = id | ||
| if self.stats_enabled: | ||
| with self.stats_lock: | ||
|
|
@@ -3179,7 +3173,7 @@ def get_source(self) -> str: | |
| else: | ||
| err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}" | ||
| raise CompileError([err], module_with_blocker=self.id) from decodeerr | ||
| elif self.path and self.manager.fscache.isdir(self.path): | ||
| elif self.path and manager.fscache.isdir(self.path): | ||
| source = "" | ||
| self.source_hash = "" | ||
| else: | ||
|
|
@@ -3192,7 +3186,7 @@ def get_source(self) -> str: | |
| self.time_spent_us += time_spent_us(t0) | ||
| return source | ||
|
|
||
| def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None: | ||
| def parse_file_inner(self, source: str | None, raw_data: FileRawData | None = None) -> None: | ||
| t0 = time_ref() | ||
| self.tree = self.manager.parse_file( | ||
| self.id, self.xpath, source, options=self.options, raw_data=raw_data | ||
|
|
@@ -3319,9 +3313,7 @@ def semantic_analysis_pass1(self) -> None: | |
| # | ||
| # TODO: This should not be considered as a semantic analysis | ||
| # pass -- it's an independent pass. | ||
| if not options.native_parser or not self.manager.fscache.exists( | ||
| self.xpath, real_only=True | ||
| ): | ||
| if not options.native_parser: | ||
| analyzer = SemanticAnalyzerPreAnalysis() | ||
| with self.wrap_context(): | ||
| analyzer.visit_file(self.tree, self.xpath, self.id, options) | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've just copied the previous contents of `def parse_parallel` straight here, as I don't think `State.parse_file()` can be refactored very simply so that parallel parsing uses the same logic, even with removing the previous sequential states handling.