Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 04bc7f6

Browse filesBrowse files
committed
FindModuleCache: leverage BuildSourceSet
On large codebases, the time in `load_graph` is dominated by `find_module` because this operation is itself `O(n)` where `n` is the number of input files, which ends up being `O(n**2)` because it is called for every import statement in the codebase. Introduce a fast path that leverages the fact that for imports within the code being typechecked, we already have a mapping of module import path to file path in `BuildSourceSet` In a real-world codebase with ~13k files split across dozens of disjoint folders, this brings `load_graph` from ~180s down to ~48s, with profiling showing that `parse` is now taking the vast majority of the time, as expected.
1 parent 835b427 commit 04bc7f6
Copy full SHA for 04bc7f6

File tree

Expand file treeCollapse file tree

2 files changed

+100
-32
lines changed
Filter options
Expand file treeCollapse file tree

2 files changed

+100
-32
lines changed

‎mypy/build.py

Copy file name to clipboardExpand all lines: mypy/build.py
+3-29Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from mypy import moduleinfo
4444
from mypy.fixup import fixup_module
4545
from mypy.modulefinder import (
46-
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
46+
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
4747
ModuleNotFoundReason
4848
)
4949
from mypy.nodes import Expression
@@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
106106
self.errors = [] # type: List[str] # Filled in by build if desired
107107

108108

109-
class BuildSourceSet:
110-
"""Efficiently test a file's membership in the set of build sources."""
111-
112-
def __init__(self, sources: List[BuildSource]) -> None:
113-
self.source_text_present = False
114-
self.source_modules = set() # type: Set[str]
115-
self.source_paths = set() # type: Set[str]
116-
117-
for source in sources:
118-
if source.text is not None:
119-
self.source_text_present = True
120-
elif source.path:
121-
self.source_paths.add(source.path)
122-
else:
123-
self.source_modules.add(source.module)
124-
125-
def is_source(self, file: MypyFile) -> bool:
126-
if file.path and file.path in self.source_paths:
127-
return True
128-
elif file._fullname in self.source_modules:
129-
return True
130-
elif self.source_text_present:
131-
return True
132-
else:
133-
return False
134-
135-
136109
def build(sources: List[BuildSource],
137110
options: Options,
138111
alt_lib_path: Optional[str] = None,
@@ -621,7 +594,8 @@ def __init__(self, data_dir: str,
621594
or options.use_fine_grained_cache)
622595
and not has_reporters)
623596
self.fscache = fscache
624-
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
597+
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
598+
source_set=self.source_set)
625599
self.metastore = create_metastore(options)
626600

627601
# a mapping from source files to their corresponding shadow files

‎mypy/modulefinder.py

Copy file name to clipboardExpand all lines: mypy/modulefinder.py
+97-3Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from mypy.defaults import PYTHON3_VERSION_MIN
1818
from mypy.fscache import FileSystemCache
19+
from mypy.nodes import MypyFile
1920
from mypy.options import Options
2021
from mypy import sitepkgs
2122

@@ -92,6 +93,33 @@ def __repr__(self) -> str:
9293
self.base_dir)
9394

9495

96+
class BuildSourceSet:
97+
"""Efficiently test a file's membership in the set of build sources."""
98+
99+
def __init__(self, sources: List[BuildSource]) -> None:
100+
self.source_text_present = False
101+
self.source_modules = {} # type: Dict[str, str]
102+
self.source_paths = set() # type: Set[str]
103+
104+
for source in sources:
105+
if source.text is not None:
106+
self.source_text_present = True
107+
if source.path:
108+
self.source_paths.add(source.path)
109+
if source.module:
110+
self.source_modules[source.module] = source.path or ''
111+
112+
def is_source(self, file: MypyFile) -> bool:
113+
if file.path and file.path in self.source_paths:
114+
return True
115+
elif file._fullname in self.source_modules:
116+
return True
117+
elif self.source_text_present:
118+
return True
119+
else:
120+
return False
121+
122+
95123
class FindModuleCache:
96124
"""Module finder with integrated cache.
97125
@@ -107,8 +135,10 @@ def __init__(self,
107135
search_paths: SearchPaths,
108136
fscache: Optional[FileSystemCache] = None,
109137
options: Optional[Options] = None,
110-
ns_packages: Optional[List[str]] = None) -> None:
138+
ns_packages: Optional[List[str]] = None,
139+
source_set: Optional[BuildSourceSet] = None) -> None:
111140
self.search_paths = search_paths
141+
self.source_set = source_set
112142
self.fscache = fscache or FileSystemCache()
113143
# Cache for get_toplevel_possibilities:
114144
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -124,6 +154,39 @@ def clear(self) -> None:
124154
self.initial_components.clear()
125155
self.ns_ancestors.clear()
126156

157+
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
158+
if not self.source_set:
159+
return None
160+
p = self.source_set.source_modules.get(id, None)
161+
if p and self.fscache.isfile(p):
162+
# NB: need to make sure we still have __init__.py all the way up
163+
# otherwise we might have false positives compared to slow path
164+
d = os.path.dirname(p)
165+
for i in range(id.count('.')):
166+
if not self.fscache.isfile(os.path.join(d, '__init__.py')):
167+
return None
168+
d = os.path.dirname(d)
169+
return p
170+
171+
idx = id.rfind('.')
172+
if idx != - 1:
173+
parent = self.find_module_via_source_set(id[:idx])
174+
if (
175+
parent and isinstance(parent, str)
176+
and not parent.endswith('__init__.py')
177+
and not self.fscache.isdir(os.path.splitext(parent)[0])
178+
):
179+
# if
180+
# 1. we're looking for foo.bar.baz
181+
# 2. foo.bar.py[i] is in the source set
182+
# 3. foo.bar is not a directory
183+
# then we don't want to go spelunking in other search paths to find
184+
# another 'bar' module, because it's a waste of time and even in the
185+
# unlikely event that we did find one that matched, it probably would
186+
# be completely unrelated and undesirable
187+
return ModuleNotFoundReason.NOT_FOUND
188+
return None
189+
127190
def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
128191
"""Find which elements of a lib_path have the directory a module needs to exist.
129192
@@ -209,8 +272,8 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
209272
"""
210273
working_dir = os.getcwd()
211274
parent_search = FindModuleCache(SearchPaths((), (), (), ()))
212-
while any(file.endswith(("__init__.py", "__init__.pyi"))
213-
for file in os.listdir(working_dir)):
275+
while any(os.path.exists(os.path.join(working_dir, f))
276+
for f in ["__init__.py", "__init__.pyi"]):
214277
working_dir = os.path.dirname(working_dir)
215278
parent_search.search_paths = SearchPaths((working_dir,), (), (), ())
216279
if not isinstance(parent_search._find_module(id), ModuleNotFoundReason):
@@ -220,6 +283,37 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
220283
def _find_module(self, id: str) -> ModuleSearchResult:
221284
fscache = self.fscache
222285

286+
# fast path for any modules in the current source set
287+
# this is particularly important when there are a large number of search
288+
# paths which share the first (few) component(s) due to the use of namespace
289+
# packages, for instance
290+
# foo/
291+
# company/
292+
# __init__.py
293+
# foo/
294+
# bar/
295+
# company/
296+
# __init__.py
297+
# bar/
298+
# baz/
299+
# company/
300+
# __init__.py
301+
# baz/
302+
#
303+
# mypy gets [foo/company/foo, foo/company/bar, foo/company/baz, ...] as input
304+
# and computes [foo, bar, baz, ...] as the module search path
305+
#
306+
# This would result in O(n) search for every import of company.* and since,
307+
# leading to O(n**2) behavior in load_graph as such imports are unsurprisingly
308+
# present at least once, and usually many more times than that, in each and
309+
# every file being parsed
310+
#
311+
# Thankfully, such cases are efficiently handled by looking up the module path
312+
# via BuildSourceSet
313+
p = self.find_module_via_source_set(id)
314+
if p:
315+
return p
316+
223317
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
224318
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
225319
# that only once and cache it for when we look for modules like 'foo.bar.blah'

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.