Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 9d46fe6

Browse files
Revert "[c10d] PGNCCL refactor part 1: adds assert size==1 (#119099)"
This reverts commit 4ab852b. Reverted #119099 on behalf of https://github.com/atalman because it breaks internal tests (see the comment on #119099).
1 parent 0f68bca commit 9d46fe6
Copy full SHA for 9d46fe6

File tree

Expand file tree / Collapse file tree

4 files changed

+209
-161
lines changed
Filter options
Expand file tree / Collapse file tree

4 files changed

+209
-161
lines changed

‎test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp

Copy file name to clipboard | Expand all lines: test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp
+15 -2 | Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -254,12 +254,16 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
254254
void SetUp() override {
255255
// Enable LOG(INFO) messages.
256256
c10::initLogging();
257-
size_t numDevices = 1; // One device per rank (thread)
257+
size_t numDevices = cudaNumDevices();
258258
TemporaryFile file;
259259
store_ = c10::make_intrusive<::c10d::FileStore>(file.path, 1);
260260

261+
at::cuda::OptionalCUDAGuard deviceGuard;
261262
tensors_.resize(numDevices);
262-
tensors_[0] = at::empty({3, 3}, at::kCUDA);
263+
for (const auto i : c10::irange(numDevices)) {
264+
deviceGuard.set_index(i);
265+
tensors_[i] = at::ones({3, 3}, at::kCUDA);
266+
}
263267
}
264268

265269
void TearDown() override {
@@ -282,6 +286,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
282286

283287
auto work = pg.allreduce(tensors_);
284288
work->wait();
289+
EXPECT_TRUE(work->isSuccess());
285290
EXPECT_EQ(1, pg.getNCCLCommCacheSize());
286291

287292
// Now run all reduce with errors.
@@ -291,6 +296,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
291296

292297
// Verify the work item failed.
293298
EXPECT_TRUE(work->isCompleted());
299+
EXPECT_FALSE(work->isSuccess());
294300
EXPECT_THROW(work->wait(), std::runtime_error);
295301

296302
// Communicators might be aborted here, further operations would fail.
@@ -308,6 +314,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLTimedoutErrorsBlocking) {
308314

309315
auto work = pg.allreduce(tensors_);
310316
work->wait();
317+
EXPECT_TRUE(work->isSuccess());
311318
EXPECT_EQ(1, pg.getNCCLCommCacheSize());
312319

313320
// Now run all reduce with errors.
@@ -329,6 +336,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNonBlocking) {
329336

330337
auto work = pg.allreduce(tensors_);
331338
pg.barrier()->wait();
339+
EXPECT_TRUE(work->isSuccess());
332340
EXPECT_EQ(1, pg.getNCCLCommCacheSize());
333341

334342
// Now run all reduce with errors.
@@ -339,7 +347,10 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNonBlocking) {
339347
work->wait();
340348
pg.barrier()->wait();
341349

350+
// Verify the work item failed.
342351
EXPECT_TRUE(work->isCompleted());
352+
EXPECT_FALSE(work->isSuccess());
353+
343354
// Communicators might be aborted here, further operations would fail.
344355
}
345356

@@ -415,6 +426,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
415426
// Normal collective case.
416427
auto work = pg.allreduce(tensors_);
417428
work->wait();
429+
EXPECT_TRUE(work->isSuccess());
418430

419431
work = pg.allreduce(tensors_);
420432
{
@@ -428,6 +440,7 @@ TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsNoHeartbeat) {
428440
EXPECT_TRUE(pg.getErrorCaughtFlag());
429441
}
430442
work->wait();
443+
EXPECT_TRUE(work->isSuccess());
431444
EXPECT_TRUE(traces.size() > 0);
432445
auto filename = c10::str(tempFilename, 0);
433446
auto traceFromStorage = readTraceFromFile(filename, traces.size());

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.