Commit cd565bc

jamesjwu authored and pytorchmergebot committed
Refactor process_inputs outside of create_aot_dispatcher_function (#130962)
This PR refactors process_inputs so that it occurs earlier, outside of create_aot_dispatcher_function, for the purpose of calculating a cache key with the inputs after they have been processed. This way, if tensors have symint sizes/strides, we successfully factor that into the cache key instead of specializing on every possible size and stride. A test that exercises this is incoming.

# Guard behavior

Note that it's technically possible for tensors with symint arguments to introduce guards in aot_dispatch if they trace through decompositions that branch on tensor size/stride. This can result in multiple graph modules with differing guards having the same key in the cache.

FXGraphCache has this same issue, and the remote FXGraphCache intentionally does not handle it: it only saves the first result in the cache, and cache misses if the guards miss. The local FXGraphCache does handle this by storing multiple files and iterating through them, but we opt not to introduce that complexity just yet for AOTAutogradCache until we deem it necessary (i.e., models appear where saving multiple cache results with different guards under the same cache key becomes important). Instead, AOTAutogradCache saves a single entry per key, overwriting it when a lookup misses due to guards.

Pull Request resolved: #130962
Approved by: https://github.com/bdhirsh
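To make the single-entry policy concrete, here is a minimal, hypothetical sketch (not the actual AOTAutogradCache implementation; the class and helper names are invented for illustration): one saved result per key, a dedicated counter when the key matches but the saved entry's guards reject the new inputs, and a plain overwrite on the next save.

```python
# Hypothetical sketch of the "single entry per key" policy; SingleEntryCache,
# guards_hold, lookup, and save are illustrative names, not PyTorch APIs.
from collections import Counter, defaultdict
from typing import Callable, Dict, Optional, Tuple

counters: Dict[str, Counter] = defaultdict(Counter)


class SingleEntryCache:
    def __init__(self) -> None:
        # key -> (guard check, cached result); at most one entry per key
        self._entries: Dict[str, Tuple[Callable[..., bool], object]] = {}

    def lookup(self, key: str, *example_inputs) -> Optional[object]:
        entry = self._entries.get(key)
        if entry is None:
            counters["aot_autograd"]["autograd_cache_miss"] += 1
            return None
        guards_hold, result = entry
        if not guards_hold(*example_inputs):
            # Same cache key, but the saved graph's guards fail for these
            # inputs: count a guard miss on top of the regular miss.
            counters["aot_autograd"]["autograd_cache_miss"] += 1
            counters["aot_autograd"]["autograd_cache_guard_miss"] += 1
            return None
        counters["aot_autograd"]["autograd_cache_hit"] += 1
        return result

    def save(self, key: str, guards_hold: Callable[..., bool], result: object) -> None:
        # A guard miss is followed by a save that overwrites the old entry,
        # rather than accumulating multiple entries per key the way the
        # local FXGraphCache does.
        self._entries[key] = (guards_hold, result)
```

The local FXGraphCache instead keeps several entries per key and scans them for one whose guards pass; the trade-off taken here is simplicity, at the likely cost of repeated recompiles if two guard regimes alternate under one key.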
1 parent 4cca18d commit cd565bc

3 files changed: 242 additions, 85 deletions

test/dynamo/test_aot_autograd_cache.py

123 additions, 6 deletions
@@ -292,6 +292,114 @@ def fn(a, b):
         self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
         self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
 
+    @largeTensorTest("64GB", device=GPU_TYPE)
+    @parametrize("device", (GPU_TYPE,))
+    @parametrize("dtype", (torch.float16, torch.bfloat16))
+    @inductor_config.patch("fx_graph_cache", True)
+    @inductor_config.patch("fx_graph_remote_cache", False)
+    @functorch_config.patch({"enable_autograd_cache": True})
+    def test_autograd_guard_single_entry(self, device, dtype):
+        """
+        Test caching the same graph, but under conditions that introduce guards
+        for tensor sizes < int32. See test_codecache::TestFxGraphCache::test_cache_load_with_guards_int32_bounds.
+
+        This test in particular tests the behavior of a single entry cache. If we ever make AOTAutogradCache
+        support multiple entries under the same key, this test should be updated.
+        """
+        if device == GPU_TYPE and not HAS_GPU:
+            raise unittest.SkipTest(f"requires {GPU_TYPE}")
+        if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater:
+            raise unittest.SkipTest("requires CUDA SM80 or later")
+
+        def fn(x, y):
+            return (x + x, y + y)
+
+        def expect_miss(compiled_fn, a, b):
+            self._clear_dynamo_and_codecache()
+            counters.clear()
+            res = compiled_fn(a, b)
+            self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_guard_miss"],
+                0,
+            )
+            self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
+            return res
+
+        def expect_hit(compiled_fn, a, b):
+            self._clear_dynamo_and_codecache()
+            counters.clear()
+            res = compiled_fn(a, b)
+            self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 0)
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_guard_miss"],
+                0,
+            )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_hit"],
+                1,
+            )
+            return res
+
+        def expect_guard_miss(compiled_fn, a, b):
+            self._clear_dynamo_and_codecache()
+            counters.clear()
+            res = compiled_fn(a, b)
+            self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_guard_miss"],
+                1,
+            )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_hit"],
+                0,
+            )
+            return res
+
+        compiled_fn = torch.compile(fn, dynamic=True)
+
+        a_shape = (5, 6)
+        b_shape = (7, 8)
+        a = torch.rand(a_shape, device=device, dtype=dtype)
+        b = torch.rand(b_shape, device=device, dtype=dtype)
+        res1 = expect_miss(compiled_fn, a, b)
+
+        # Same shape, should cache hit
+        a2 = a.detach().clone()
+        b2 = b.detach().clone()
+
+        res2 = expect_hit(compiled_fn, a2, b2)
+
+        self.assertEqual(res1, res2)
+
+        # By changing the shape greatly, despite the same exact input
+        # graph, inductor should report a guard miss, leading
+        # to a cache miss on our end.
+        a_shape = (5, 6)
+        b_shape = (47000, 47001)
+        a3 = torch.rand(a_shape, device=device, dtype=dtype)
+        b3 = torch.rand(b_shape, device=device, dtype=dtype)
+
+        expect_guard_miss(compiled_fn, a3, b3)
+
+        # Wobble the shape a bit, but not enough
+        # to trigger a guard miss (since 6, 7 is still less than int32)
+        # Should result in a cache hit
+        a_shape = (6, 7)
+        b_shape = (47000, 47001)
+        a4 = torch.rand(a_shape, device=device, dtype=dtype)
+        b4 = torch.rand(b_shape, device=device, dtype=dtype)
+        expect_hit(compiled_fn, a4, b4)
+
+        # Change the shape back to the original,
+        # FXGraphCache should hit because it stores
+        # multiple entries
+        a_shape = (5, 6)
+        b_shape = (7, 8)
+        a5 = torch.rand(a_shape, device=device, dtype=dtype)
+        b5 = torch.rand(b_shape, device=device, dtype=dtype)
+        expect_hit(compiled_fn, a5, b5)
+
     @largeTensorTest("64GB", device=GPU_TYPE)
     @parametrize("device", (GPU_TYPE,))
     @parametrize("dtype", (torch.float16, torch.bfloat16))
@@ -301,7 +409,9 @@ def fn(a, b):
     @functorch_config.patch({"enable_autograd_cache": True})
     def test_autograd_inductor_guards(self, device, dtype, requires_grad):
         """
-        Tests that functions that would add inductor guards are cached properly
+        Test caching the same graph, but under conditions that introduce guards
+        for tensor sizes < int32.
+        See test_codecache::TestFxGraphCache::test_cache_load_with_guards_int32_bounds.
         """
         if device == GPU_TYPE and not HAS_GPU:
             raise unittest.SkipTest(f"requires {GPU_TYPE}")
@@ -323,6 +433,7 @@ def fn(x, y):
             ((47000, 47001), (5, 6)),
         )
         expected_hits = expected_misses = expected_saves = 0
+        expected_guard_misses = 0
         for a_shape, b_shape in shapes:
            a = torch.rand(
                a_shape, device=device, dtype=dtype, requires_grad=requires_grad
@@ -336,15 +447,15 @@ def fn(x, y):
             # see a recompilation (along with a cache miss).
             res1 = compiled_fn(a, b)
             # A first call should miss in the cache.
-            # NOTE: Currently, this cache miss is *not* due to guards,
-            # but instead because the AOTAutogradCache key calculation specializes on input shapes.
-            # Once we allow tensors with symints as part of the cache key calculation, it will
-            # instead cache miss because of guard failure.
             expected_misses += 1
-
             self.assertEqual(
                 counters["aot_autograd"]["autograd_cache_miss"], expected_misses
             )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_guard_miss"],
+                expected_guard_misses,
+            )
+
             self.assertEqual(
                 counters["aot_autograd"]["autograd_cache_hit"], expected_hits
             )
@@ -375,6 +486,12 @@ def fn(x, y):
             self.assertEqual(
                 counters["aot_autograd"]["autograd_cache_miss"], expected_misses
             )
+            self.assertEqual(
+                counters["aot_autograd"]["autograd_cache_guard_miss"],
+                expected_guard_misses,
+            )
+            # First compile is a regular cache miss, subsequent are guard misses
+            expected_guard_misses += 1
             self.assertEqual(
                 counters["aot_autograd"]["autograd_cache_hit"], expected_hits
             )

torch/_functorch/_aot_autograd/autograd_cache.py

3 additions, 0 deletions
@@ -473,6 +473,9 @@ def load(
         # Count missing the FXGraphCache as a miss not a bypass
         except FXGraphCacheMiss as e:
             counters["aot_autograd"]["autograd_cache_miss"] += 1
+            # Special counter when we pass autograd cache but
+            # fail when on inductor guards
+            counters["aot_autograd"]["autograd_cache_guard_miss"] += 1
             if config.strict_autograd_cache:
                 raise e
         except BypassAOTAutogradCache as e:
