[c10d] PGNCCL refactor part 1: adds assert size==1 #119099

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed · wants to merge 5 commits
test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp — 51 changes: 9 additions & 42 deletions

```diff
@@ -254,16 +254,19 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
   void SetUp() override {
     // Enable LOG(INFO) messages.
     c10::initLogging();
-    size_t numDevices = cudaNumDevices();
+    // Need to have this check at SetUp to make sure we only run the test --
+    // including the init -- when there are GPUs available.
+    if (skipTest()) {
+      GTEST_SKIP() << "Skipping ProcessGroupNCCLErrorsTest because system "
+                   << "requirement is not met (no CUDA or GPU).";
+    }
+
+    size_t numDevices = 1; // One device per rank (thread)
     TemporaryFile file;
     store_ = c10::make_intrusive<::c10d::FileStore>(file.path, 1);
 
-    at::cuda::OptionalCUDAGuard deviceGuard;
     tensors_.resize(numDevices);
-    for (const auto i : c10::irange(numDevices)) {
-      deviceGuard.set_index(i);
-      tensors_[i] = at::ones({3, 3}, at::kCUDA);
-    }
+    tensors_[0] = at::empty({3, 3}, at::kCUDA);
   }
 
   void TearDown() override {
```
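The hunk above relies on a standard GoogleTest behavior: calling `GTEST_SKIP()` inside `SetUp()` marks the test as skipped before any per-test initialization runs, so the check no longer needs to be repeated in every `TEST_F` body. A minimal sketch of that pattern — `gpusAvailable()` is a hypothetical stand-in for the real `skipTest()`/`cudaNumDevices()` query:

```cpp
// Minimal sketch of the skip-in-SetUp pattern adopted by this PR.
// gpusAvailable() is a made-up stand-in for the real CUDA device query.
#include <gtest/gtest.h>

static bool gpusAvailable() {
  return false;  // pretend no GPU is present
}

class GpuFixture : public ::testing::Test {
 protected:
  void SetUp() override {
    if (!gpusAvailable()) {
      // GTEST_SKIP() in SetUp() marks the test SKIPPED and prevents the
      // test body from running, so no per-test CUDA init is attempted.
      GTEST_SKIP() << "No CUDA GPU available.";
    }
  }
};

TEST_F(GpuFixture, OnlyRunsWithGpus) {
  SUCCEED();  // reached only when gpusAvailable() returned true
}
```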

```diff
@@ -275,18 +278,13 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
 };
 
 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
-  if (skipTest()) {
-    return;
-  }
-
   ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "1", 1) == 0);
   auto options = c10d::ProcessGroupNCCL::Options::create();
   options->timeout = std::chrono::milliseconds(1000);
   ProcessGroupNCCLSimulateErrors pg(store_, 0, 1, options);
 
   auto work = pg.allreduce(tensors_);
   work->wait();
-  EXPECT_TRUE(work->isSuccess());
```
Review thread on the removed `EXPECT_TRUE(work->isSuccess());` line:

Reviewer (Contributor): Is this because the single-device style is blocking while the multi-device style was non-blocking, or is there another reason?

kwen2501 (Author), Feb 3, 2024: No, it is just because the `is_success` API has been deprecated for a long time at the pybind layer:
https://github.com/pytorch/pytorch/blob/main/torch/csrc/distributed/c10d/init.cpp#L2556-L2560
so I am taking this chance to remove the C++ code as well. Its presence would also complicate my refactoring.

Deprecating it makes sense: how could we tell that a collective succeeded? Would we be telling the user that the result is correct? We obviously don't (and cannot) check the data, so at best we can only report `isCompleted()`.
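To make the author's point concrete, here is a self-contained sketch — the `Work` type below is a stand-in, not the real `c10d::Work` — of the reporting model left after removing `isSuccess()`: failure surfaces as an exception thrown from `wait()`, and the API can truthfully report completion only, never data correctness.

```cpp
// Completion-vs-success sketch; Work is hypothetical, not c10d::Work.
#include <iostream>
#include <stdexcept>

struct Work {
  bool failed = false;
  bool completed = false;
  void wait() {
    completed = true;
    // The only honest failure signal: an exception from wait().
    if (failed) throw std::runtime_error("NCCL error during collective");
  }
  bool isCompleted() const { return completed; }
};

int main() {
  Work work;
  work.failed = true;  // simulate a communicator error
  try {
    work.wait();
    std::cout << "completed: " << work.isCompleted() << "\n";
  } catch (const std::exception& e) {
    std::cout << "collective failed: " << e.what() << "\n";
  }
  return 0;
}
```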

```diff
   EXPECT_EQ(1, pg.getNCCLCommCacheSize());
 
   // Now run all reduce with errors.
@@ -296,25 +294,19 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
 
   // Verify the work item failed.
   EXPECT_TRUE(work->isCompleted());
-  EXPECT_FALSE(work->isSuccess());
   EXPECT_THROW(work->wait(), std::runtime_error);
 
   // Communicators might be aborted here, further operations would fail.
 }
 
 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLTimedoutErrorsBlocking) {
-  if (skipTest()) {
-    return;
-  }
-
   ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "1", 1) == 0);
   auto options = c10d::ProcessGroupNCCL::Options::create();
   options->timeout = std::chrono::milliseconds(3000);
   ProcessGroupNCCLTimedOutErrors pg(store_, 0, 1, options);
 
   auto work = pg.allreduce(tensors_);
   work->wait();
-  EXPECT_TRUE(work->isSuccess());
   EXPECT_EQ(1, pg.getNCCLCommCacheSize());
 
   // Now run all reduce with errors.
@@ -326,17 +318,12 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLTimedoutErrorsBlocking) {
 }
```
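Both blocking-mode tests drive error propagation through the same two knobs: the real `TORCH_NCCL_BLOCKING_WAIT` environment variable and the per-group timeout on `Options`. A hedged sketch of just that configuration step, using POSIX `setenv` exactly as the `ASSERT_TRUE(setenv(...))` calls above do:

```cpp
// Configuration-only sketch; it sets the same environment variable the
// tests set. The Options usage is quoted from the tests, not re-run here.
#include <cassert>
#include <cstdlib>

int main() {
  // "1" => ProcessGroupNCCL surfaces NCCL errors synchronously from wait().
  int rc = setenv("TORCH_NCCL_BLOCKING_WAIT", "1", /*overwrite=*/1);
  assert(rc == 0);
  // The tests then create the group with a short timeout, e.g.:
  //   auto options = c10d::ProcessGroupNCCL::Options::create();
  //   options->timeout = std::chrono::milliseconds(1000);
  return 0;
}
```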

```diff
 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNonBlocking) {
-  if (skipTest()) {
-    return;
-  }
-
   auto options = c10d::ProcessGroupNCCL::Options::create();
   options->timeout = std::chrono::milliseconds(3000);
   ProcessGroupNCCLSimulateErrors pg(store_, 0, 1, options);
 
   auto work = pg.allreduce(tensors_);
   pg.barrier()->wait();
-  EXPECT_TRUE(work->isSuccess());
   EXPECT_EQ(1, pg.getNCCLCommCacheSize());
 
   // Now run all reduce with errors.
@@ -347,10 +334,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNonBlocking) {
   work->wait();
   pg.barrier()->wait();
 
-  // Verify the work item failed.
-  EXPECT_TRUE(work->isCompleted());
-  EXPECT_FALSE(work->isSuccess());
-
   // Communicators might be aborted here, further operations would fail.
 }
```
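The non-blocking variant differs from the blocking tests in how an error is observed: nothing throws eagerly, so the test must drive progress (`wait()` plus a barrier) before inspecting the work object. A self-contained sketch of the underlying poll-until-deadline idea, again with a stand-in work object rather than the real `c10d::Work`:

```cpp
// Poll-with-deadline sketch for non-blocking completion; the Work struct
// is hypothetical and simply completes after a short delay.
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

struct Work {
  std::atomic<bool> completed{false};
  bool isCompleted() const { return completed.load(); }
};

int main() {
  Work work;
  std::thread producer([&work] {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    work.completed.store(true);  // simulate the collective finishing
  });

  auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(3);
  bool done = false;
  while (std::chrono::steady_clock::now() < deadline) {
    if (work.isCompleted()) { done = true; break; }
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
  producer.join();
  std::cout << (done ? "completed before deadline" : "timed out") << "\n";
  return 0;
}
```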

```diff
@@ -388,10 +372,6 @@ class TestDebugInfoWriter : public c10d::DebugInfoWriter {
 };
 
 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
-  if (skipTest()) {
-    return;
-  }
-
   int heartBeatIntervalInSec = 2;
   std::string timeInterval = std::to_string(heartBeatIntervalInSec);
   ASSERT_TRUE(setenv(c10d::TORCH_NCCL_BLOCKING_WAIT[0].c_str(), "1", 1) == 0);
@@ -426,7 +406,6 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
   // Normal collective case.
   auto work = pg.allreduce(tensors_);
   work->wait();
-  EXPECT_TRUE(work->isSuccess());
 
   work = pg.allreduce(tensors_);
   {
@@ -440,7 +419,6 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
     EXPECT_TRUE(pg.getErrorCaughtFlag());
   }
   work->wait();
-  EXPECT_TRUE(work->isSuccess());
   EXPECT_TRUE(traces.size() > 0);
   auto filename = c10::str(tempFilename, 0);
   auto traceFromStorage = readTraceFromFile(filename, traces.size());
```
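testNCCLErrorsNoHeartbeat checks that the debug dump written through the `DebugInfoWriter` can be read back intact (`readTraceFromFile`). A minimal sketch of that write-then-verify round trip, with a made-up path and a plain byte buffer in place of the real flight-recorder trace:

```cpp
// Write-then-read round trip, mirroring the trace verification idea.
// The path and payload below are illustrative only.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <vector>

int main() {
  const char* path = "/tmp/nccl_trace_rank_0";  // hypothetical dump file
  std::vector<uint8_t> traces = {0xde, 0xad, 0xbe, 0xef};

  // Writer side: dump the buffer to disk.
  {
    std::ofstream out(path, std::ios::binary);
    out.write(reinterpret_cast<const char*>(traces.data()), traces.size());
  }

  // Reader side: load it back and compare, as the test does.
  std::vector<uint8_t> fromStorage(traces.size());
  std::ifstream in(path, std::ios::binary);
  in.read(reinterpret_cast<char*>(fromStorage.data()), fromStorage.size());

  std::cout << (fromStorage == traces ? "trace round trip OK" : "mismatch")
            << "\n";
  return 0;
}
```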
```diff
@@ -492,12 +470,6 @@ class ProcessGroupNCCLWatchdogTimeoutTest : public ProcessGroupNCCLErrorsTest {
 };
 
 TEST_F(ProcessGroupNCCLWatchdogTimeoutTest, testNCCLTimedoutDebugInfoFinished) {
-  // Need to have this check for every test to make sure we only run the test
-  // when there are GPUs available.
-  if (skipTest()) {
-    return;
-  }
-
   ProcessGroupNCCLNoHeartbeatCaught pg(store_, 0, 1, options_);
   // Write debug info will lead to watchdog thread to wait for 30 seconds.
   // And this is hard to override, so we just call it before hand. Otherwise,
@@ -518,11 +490,6 @@ TEST_F(ProcessGroupNCCLWatchdogTimeoutTest, testNCCLTimedoutDebugInfoFinished) {
 }
 
 TEST_F(ProcessGroupNCCLWatchdogTimeoutTest, testNCCLTimedoutDebugInfoStuck) {
-  // Need to have this check for every test to make sure we only run the test
-  // when there are GPUs available.
-  if (skipTest()) {
-    return;
-  }
   ProcessGroupNCCLDebugInfoStuck pg(store_, 0, 1, options_);
   // Need to keep main thread sleep longer so that we can let heartbeat monitor
   // thread to finish the extra wait and flip the flag.
```
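Both watchdog tests exercise a heartbeat-monitor mechanism: a side thread periodically checks that the watchdog is making progress and flips a flag when it is not. A self-contained sketch of that monitor loop, using a plain counter in place of ProcessGroupNCCL's internal heartbeat (all names illustrative):

```cpp
// Heartbeat-monitor sketch: the monitor samples a counter that the
// "watchdog" should keep incrementing; if the counter stalls across a
// full interval, the monitor flips an error flag.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <thread>

int main() {
  std::atomic<uint64_t> heartbeat{0};
  std::atomic<bool> stuck{false};
  const auto interval = std::chrono::milliseconds(100);

  std::thread monitor([&] {
    uint64_t last = heartbeat.load();
    for (int i = 0; i < 5; ++i) {
      std::this_thread::sleep_for(interval);
      uint64_t now = heartbeat.load();
      if (now == last) {  // no progress in a full interval
        stuck.store(true);
        return;
      }
      last = now;
    }
  });

  // Simulate a watchdog that beats twice and then hangs.
  for (int i = 0; i < 2; ++i) {
    std::this_thread::sleep_for(interval / 2);
    heartbeat.fetch_add(1);
  }

  monitor.join();
  std::cout << (stuck.load() ? "monitor detected a stuck watchdog"
                             : "watchdog healthy")
            << "\n";
  return 0;
}
```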