[c10d] PGNCCL refactor part 1: adds assert size==1 #119099

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed · wants to merge 5 commits
Changes from 1 commit
Add assert size==1 to PGNCCL
kwen2501 committed Feb 2, 2024
commit cfdb92248bfd2d3dd4af387d3d2a0a97b40786dc
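For readers skimming the diff below: TORCH_CHECK(cond, msg) throws a c10::Error carrying msg when cond is false, so each collective entry point now rejects calls that pass more than one tensor. A minimal, self-contained sketch of that guard follows; it is an illustration only, with std::runtime_error standing in for c10::Error and the helper name assert_single_tensor being hypothetical, not part of this commit.

#include <stdexcept>
#include <vector>

// Illustration of the guard pattern added in this commit, i.e.
//   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
// The real macro throws c10::Error with source-location info; this sketch
// uses std::runtime_error and a hypothetical helper name.
template <typename TensorT>
void assert_single_tensor(const std::vector<TensorT>& tensors,
                          const char* error_msg) {
  if (tensors.size() != 1) {
    throw std::runtime_error(error_msg);
  }
}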
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (17 additions, 0 deletions)
@@ -714,6 +714,13 @@ void ProcessGroupNCCL::WorkNCCL::abort() {

static std::atomic<size_t> process_group_id = 0;

constexpr const char* MULTI_DEVICE_ERROR_MSG =
"Expecting one tensor only but got multiple. You are probably using multiple "
"devices under one thread. The support for such usage has been deprecated. "
"For details, please refer to "
"https://pytorch.org/docs/stable/distributed.html#multi-gpu-collective-functions. "
"ProcessGroupNCCL continues supporting multi-process and multi-thread modes.";

ProcessGroupNCCL::ProcessGroupNCCL(
const c10::intrusive_ptr<Store>& store,
int rank,
@@ -2831,6 +2838,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_sparse(
std::vector<at::Tensor>& tensors,
const AllreduceOptions& opts) {
TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
#ifdef IS_NCCL_EXP
std::vector<at::Tensor> outputTensors(tensors.size());
for (std::vector<at::Tensor>::size_type i = 0; i < tensors.size(); i++) {
@@ -2934,6 +2942,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_impl(
c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce(
std::vector<at::Tensor>& tensors,
const AllreduceOptions& opts) {
TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
if (intraNodeComm_ != nullptr && tensors.size() == 1 &&
opts.reduceOp == ReduceOp::SUM) {
using namespace intra_node_comm;
@@ -3000,6 +3009,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
std::vector<at::Tensor>& tensors,
const BroadcastOptions& opts) {
TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
check_gpu_tensors_different_devices(tensors);

// @lint-ignore CLANGTIDY
@@ -3110,6 +3120,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_broadcast_oop(
c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
std::vector<at::Tensor>& tensors,
const ReduceOptions& opts) {
TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
check_gpu_tensors_different_devices(tensors);
// @lint-ignore CLANGTIDY
auto tensor = tensors.back();
@@ -3226,6 +3237,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
std::vector<std::vector<at::Tensor>>& outputTensors,
std::vector<at::Tensor>& inputTensors,
const AllgatherOptions& opts) {
TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
check_gpu_tensors_different_devices(inputTensors);
// @lint-ignore CLANGTIDY
bool same_size = check_same_size(outputTensors.back());
@@ -3370,6 +3382,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
std::vector<at::Tensor>& outputTensors,
std::vector<std::vector<at::Tensor>>& inputTensors,
const ReduceScatterOptions& opts) {
TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
check_gpu_tensors_different_devices(outputTensors);
// @lint-ignore CLANGTIDY
bool same_size = check_same_size(inputTensors.back());
@@ -3849,6 +3862,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::send(
std::vector<at::Tensor>& tensors,
int dstRank,
int /* unused */) {
TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
check_gpu_tensors_different_devices(tensors, true);

// @lint-ignore CLANGTIDY
@@ -3889,6 +3903,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::recv(
std::vector<at::Tensor>& tensors,
int srcRank,
int /* unused */) {
TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
check_gpu_tensors_different_devices(tensors, true);

// @lint-ignore CLANGTIDY
@@ -4021,6 +4036,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
check_gpu_tensors_different_devices(inputTensors, true);
assertSingleElementInput(invalidArgument, inputTensors);

TORCH_CHECK(inputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
// @lint-ignore CLANGTIDY
auto tensor = inputTensors.back();

@@ -4110,6 +4126,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::scatter(
assertSingleElementInput(invalidArgument, outputTensors);

// @lint-ignore CLANGTIDY
TORCH_CHECK(outputTensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
auto tensor = outputTensors.back();

std::vector<at::Tensor> inputs;
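To illustrate the effect at a call site, a hedged usage sketch follows. Assumptions not taken from the PR: a libtorch build with c10d and NCCL available, an already-initialized ProcessGroupNCCL named pg, and two CUDA tensors t0 and t1 on different devices.

#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>

// Usage sketch, not part of the PR. pg is assumed to be an initialized
// ProcessGroupNCCL; t0 and t1 are CUDA tensors on two different devices.
void example(const c10::intrusive_ptr<c10d::ProcessGroupNCCL>& pg,
             at::Tensor t0,
             at::Tensor t1) {
  std::vector<at::Tensor> one{t0};
  pg->allreduce(one);  // OK after this commit: exactly one tensor per call

  std::vector<at::Tensor> two{t0, t1};
  // pg->allreduce(two);  // would now fail the TORCH_CHECK and throw with
  //                      // MULTI_DEVICE_ERROR_MSG, since multi-device
  //                      // collectives under one thread are deprecated
}

The supported pattern is one device per process (or per thread), each issuing single-tensor collectives, which matches the multi-GPU guidance linked in MULTI_DEVICE_ERROR_MSG.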