Commit b115778

Fix group as list of ints in torch dist collective ops
1 parent 6401a59 commit b115778

4 files changed: +113 −109 lines

ignite/distributed/comp_models/native.py

Lines changed: 17 additions & 11 deletions
@@ -408,6 +408,15 @@ def spawn(
             **spawn_kwargs,
         )

+    def _setup_group(self, group: Optional[Any]) -> dist.ProcessGroup:
+        if isinstance(group, list) and all(isinstance(item, int) for item in group):
+            group = self._do_new_group(group)
+        if not (isinstance(group, dist.ProcessGroup) or group == dist.GroupMember.NON_GROUP_MEMBER):
+            raise ValueError(
+                f"Argument group should be list of int or ProcessGroup, got {type(group)}, group={group}"
+            )
+        return group
+
     _reduce_op_map = {
         "SUM": dist.ReduceOp.SUM,
         "PRODUCT": dist.ReduceOp.PRODUCT,
@@ -420,8 +429,8 @@ def spawn(
     def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[Any] = None) -> torch.Tensor:
         if op not in self._reduce_op_map:
             raise ValueError(f"Unsupported reduction operation: '{op}'")
-        if group is not None and not isinstance(group, dist.ProcessGroup):
-            raise ValueError("Argument group should be list of int or ProcessGroup")
+        if group is not None:
+            group = self._setup_group(group)
         reduce_op = self._reduce_op_map[op]
         # We do if/else here for compatibility with older pytorch versions
         if group is not None:
@@ -431,15 +440,14 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[
         return tensor

     def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
+        if group is not None:
+            group = self._setup_group(group)
         if group == dist.GroupMember.NON_GROUP_MEMBER:
             return tensor
-
         if group is None:
             group_size = self.get_world_size()
-        elif isinstance(group, dist.ProcessGroup):
-            group_size = group.size()
         else:
-            raise ValueError("Argument group should be list of int or ProcessGroup")
+            group_size = group.size()
         if tensor.ndimension() == 0:
             tensor = tensor.unsqueeze(0)
         output = [torch.zeros_like(tensor) for _ in range(group_size)]
@@ -456,16 +464,14 @@ def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> Lis
                 "Current torch version does not implement dist.all_gather_object. "
                 "Required version should be >=1.7.0"
             )
-
+        if group is not None:
+            group = self._setup_group(group)
         if group == dist.GroupMember.NON_GROUP_MEMBER:
             return tensor
-
         if group is None:
             group_size = self.get_world_size()
-        elif isinstance(group, dist.ProcessGroup):
-            group_size = group.size()
         else:
-            raise ValueError("Argument group should be list of int or ProcessGroup")
+            group_size = group.size()
         output = [None for _ in range(group_size)]
         # We do if/else here for compatibility with older pytorch versions
         if group is not None:
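
The new `_setup_group` helper centralizes the normalization that callers previously duplicated: a list of ints becomes a `dist.ProcessGroup` via `_do_new_group`, and non-member ranks keep the `dist.GroupMember.NON_GROUP_MEMBER` sentinel so the collectives above can early-return their input tensor. Below is a minimal standalone sketch of the same logic, using the public `dist.new_group` in place of the comp model's `_do_new_group`; the function name is hypothetical.

```python
from typing import Any, Optional

import torch.distributed as dist


def setup_group_sketch(group: Optional[Any]) -> Any:
    """Hypothetical standalone version of the _setup_group method above."""
    if isinstance(group, list) and all(isinstance(item, int) for item in group):
        # A list of global ranks is converted into a process group.
        group = dist.new_group(group)
    # new_group returns dist.GroupMember.NON_GROUP_MEMBER on ranks outside the
    # group; accepting that sentinel is what lets non-member ranks no-op.
    if not (isinstance(group, dist.ProcessGroup) or group == dist.GroupMember.NON_GROUP_MEMBER):
        raise ValueError(f"Argument group should be list of int or ProcessGroup, got {type(group)}")
    return group
```

Accepting the sentinel rather than raising is the key behavioral fix: previously a rank excluded from the group list failed the `isinstance(group, dist.ProcessGroup)` check.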

ignite/distributed/utils.py

Lines changed: 0 additions & 6 deletions
@@ -347,9 +347,6 @@ def all_reduce(
     if _need_to_sync and isinstance(_model, _SerialModel):
         sync(temporary=True)

-    if isinstance(group, list) and all(isinstance(item, int) for item in group):
-        group = _model.new_group(group)
-
     return _model.all_reduce(tensor, op, group=group)


@@ -429,9 +426,6 @@ def all_gather(
     if _need_to_sync and isinstance(_model, _SerialModel):
         sync(temporary=True)

-    if isinstance(group, list) and all(isinstance(item, int) for item in group):
-        group = _model.new_group(group)
-
     return _model.all_gather(tensor, group=group)

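
With the list-to-group conversion moved into the comp model, the public helpers now forward `group` unchanged and each backend interprets it itself. A hedged usage sketch, assuming a native gloo run with two CPU processes (the launcher parameters here are illustrative, not part of this commit):

```python
import torch

import ignite.distributed as idist


def training(local_rank: int) -> None:
    rank = idist.get_rank()
    t = torch.tensor([rank])
    # group can now be passed as a plain list of global ranks.
    if rank in (0, 1):  # call group collectives on participating ranks only
        res = idist.all_reduce(t, op="SUM", group=[0, 1])
        assert res.item() == 0 + 1


with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
    parallel.run(training)
```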

tests/ignite/distributed/utils/__init__.py

Lines changed: 95 additions & 91 deletions
@@ -119,39 +119,44 @@ def _test_distrib_all_reduce(device):


 def _test_distrib_all_reduce_group(device):
-    if idist.get_world_size() > 1 and idist.backend() is not None:
-        ranks = [0, 1]
-        rank = idist.get_rank()
-        t = torch.tensor([rank], device=device)
-        bnd = idist.backend()
+    assert idist.get_world_size() > 1, idist.get_world_size()
+    assert idist.backend() is not None, idist.backend()

-        group = idist.new_group(ranks)
-        if bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
-                res = idist.all_reduce(t, group=group)
-        else:
+    ranks = [0, 1]
+    rank = idist.get_rank()
+    t = torch.tensor([rank], device=device)
+    bnd = idist.backend()
+
+    group = idist.new_group(ranks)
+    if bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
+            res = idist.all_reduce(t, group=group)
+    else:
+        if rank in ranks:
+            # we should call all_reduce with group on the participating ranks only
+            # otherwise a warning is raised:
+            # UserWarning: Running all_reduce on global rank 2 which does not belong to the given group.
             res = idist.all_reduce(t, group=group)
             assert res == torch.tensor([sum(ranks)], device=device)

-        t = torch.tensor([rank], device=device)
-        if bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
-                res = idist.all_reduce(t, group=ranks)
-        else:
+    t = torch.tensor([rank], device=device)
+    if bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
+            res = idist.all_reduce(t, group=ranks)
+    else:
+        if rank in ranks:
             res = idist.all_reduce(t, group=ranks)
             assert res == torch.tensor([sum(ranks)], device=device)

-        ranks = "abc"
-
-        if bnd in ("nccl", "gloo", "mpi"):
-            with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"):
-                res = idist.all_reduce(t, group="abc")
-        elif bnd in ("xla-tpu"):
-            with pytest.raises(ValueError, match=r"Argument group should be list of int"):
-                res = idist.all_reduce(t, group="abc")
-        elif bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
-                res = idist.all_reduce(t, group="abc")
+    if bnd in ("nccl", "gloo", "mpi"):
+        with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"):
+            idist.all_reduce(t, group="abc")
+    elif bnd in ("xla-tpu"):
+        with pytest.raises(ValueError, match=r"Argument group should be list of int"):
+            idist.all_reduce(t, group="abc")
+    elif bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
+            idist.all_reduce(t, group="abc")


 def _test_distrib_all_gather(device):
@@ -218,77 +223,76 @@ def _test_distrib_all_gather(device):


 def _test_distrib_all_gather_group(device):
-    if idist.get_world_size() > 1:
-        ranks = list(range(idist.get_world_size() - 1, 0, -1))  # [0, 1, 2, 3] -> [3, 2, 1]
-        rank = idist.get_rank()
-        bnd = idist.backend()
+    assert idist.get_world_size() > 1, idist.get_world_size()

-        t = torch.tensor([rank], device=device)
-        group = idist.new_group(ranks)
-        if bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
-                res = idist.all_gather(t, group=group)
-        else:
-            res = idist.all_gather(t, group=group)
-            if rank in ranks:
-                assert torch.equal(res, torch.tensor(ranks, device=device))
-            else:
-                assert res == t
+    ranks = list(range(idist.get_world_size() - 1, 0, -1))  # [0, 1, 2, 3] -> [3, 2, 1]
+    rank = idist.get_rank()
+    bnd = idist.backend()

-        t = torch.tensor([rank], device=device)
-        if bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
-                res = idist.all_gather(t, group=ranks)
+    t = torch.tensor([rank], device=device)
+    group = idist.new_group(ranks)
+    if bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
+            res = idist.all_gather(t, group=group)
+    else:
+        res = idist.all_gather(t, group=group)
+        if rank in ranks:
+            assert torch.equal(res, torch.tensor(sorted(ranks), device=device)), res
         else:
-            res = idist.all_gather(t, group=ranks)
-            if rank in ranks:
-                assert torch.equal(res, torch.tensor(ranks, device=device))
-            else:
-                assert res == t
+            assert res == t

-        t = {
-            "a": [rank + 1, rank + 2, torch.tensor(rank + 3, device=device)],
-            "b": torch.tensor([[rank + 1, rank + 2, rank + 3]], device=device),
-            "c": {"abcd": rank, "cdfg": torch.tensor(rank, dtype=torch.uint8, device=device)},
-        }
-        if bnd in ("xla-tpu"):
-            with pytest.raises(NotImplementedError, match=r"all_gather on object is not implemented for xla"):
-                res = idist.all_gather(t, group=ranks)
-        elif bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
-                res = idist.all_gather(t, group=ranks)
+    t = torch.tensor([rank], device=device)
+    if bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
+            res = idist.all_gather(t, group=ranks)
+    else:
+        res = idist.all_gather(t, group=ranks)
+        if rank in ranks:
+            assert torch.equal(res, torch.tensor(sorted(ranks), device=device))
         else:
+            assert res == t
+
+    t = {
+        "a": [rank + 1, rank + 2, torch.tensor(rank + 3, device=device)],
+        "b": torch.tensor([[rank + 1, rank + 2, rank + 3]], device=device),
+        "c": {"abcd": rank, "cdfg": torch.tensor(rank, dtype=torch.uint8, device=device)},
+    }
+    if bnd in ("xla-tpu"):
+        with pytest.raises(NotImplementedError, match=r"all_gather on object is not implemented for xla"):
            res = idist.all_gather(t, group=ranks)
-        if rank in ranks:
-            assert isinstance(res, list) and len(res) == len(ranks)
-            for i, obj in zip(ranks, res):
-                assert isinstance(obj, dict)
-                assert list(obj.keys()) == ["a", "b", "c"], obj
-                expected_device = (
-                    device
-                    if torch.device(device).type == "cpu"
-                    else torch.device(f"{torch.device(device).type}:{i}")
-                )
-                expected = {
-                    "a": [i + 1, i + 2, torch.tensor(i + 3, device=expected_device)],
-                    "b": torch.tensor([[i + 1, i + 2, i + 3]], device=expected_device),
-                    "c": {"abcd": i, "cdfg": torch.tensor(i, dtype=torch.uint8, device=expected_device)},
-                }
-                assert obj["a"] == expected["a"], (obj, expected)
-                assert (obj["b"] == expected["b"]).all(), (obj, expected)
-                assert obj["c"] == expected["c"], (obj, expected)
-        else:
-            assert res == t
-
-        if bnd in ("nccl", "gloo", "mpi"):
-            with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"):
-                res = idist.all_gather(t, group="abc")
-        elif bnd in ("xla-tpu"):
-            with pytest.raises(ValueError, match=r"Argument group should be list of int"):
-                res = idist.all_gather(t, group="abc")
-        elif bnd in ("horovod"):
-            with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
-                res = idist.all_gather(t, group="abc")
+    elif bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
+            res = idist.all_gather(t, group=ranks)
+    else:
+        res = idist.all_gather(t, group=ranks)
+        if rank in ranks:
+            assert isinstance(res, list) and len(res) == len(ranks)
+            for i, obj in zip(sorted(ranks), res):
+                assert isinstance(obj, dict)
+                assert list(obj.keys()) == ["a", "b", "c"], obj
+                expected_device = (
+                    device if torch.device(device).type == "cpu" else torch.device(f"{torch.device(device).type}:{i}")
+                )
+                expected = {
+                    "a": [i + 1, i + 2, torch.tensor(i + 3, device=expected_device)],
+                    "b": torch.tensor([[i + 1, i + 2, i + 3]], device=expected_device),
+                    "c": {"abcd": i, "cdfg": torch.tensor(i, dtype=torch.uint8, device=expected_device)},
+                }
+                assert obj["a"] == expected["a"], (obj, expected)
+                assert (obj["b"] == expected["b"]).all(), (obj, expected)
+                assert obj["c"] == expected["c"], (obj, expected)
+        else:
+            assert res == t
+
+    if bnd in ("nccl", "gloo", "mpi"):
+        with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"):
+            res = idist.all_gather(t, group="abc")
+    elif bnd in ("xla-tpu"):
+        with pytest.raises(ValueError, match=r"Argument group should be list of int"):
+            res = idist.all_gather(t, group="abc")
+    elif bnd in ("horovod"):
+        with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
+            res = idist.all_gather(t, group="abc")


 def _test_idist_all_gather_tensors_with_shapes(device):
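
Two details of the rewritten tests are worth noting. First, group collectives are now invoked on participating ranks only, since PyTorch warns when a non-member rank joins a group collective. Second, the expected gather order is `sorted(ranks)`, which appears to reflect that `dist.new_group` assigns group ranks in sorted global-rank order regardless of the order of the input list. A condensed sketch of the pattern, assuming a native backend run with world_size >= 3:

```python
import torch

import ignite.distributed as idist

ranks = [2, 1]  # member ranks; the listed order does not matter
rank = idist.get_rank()
t = torch.tensor([rank])

# all_reduce: call on participating ranks only, otherwise PyTorch warns that
# the calling rank "does not belong to the given group".
if rank in ranks:
    res = idist.all_reduce(t, group=ranks)
    assert res.item() == sum(ranks)

# all_gather: safe on every rank; non-members get their input back, members
# receive contributions in sorted global-rank order.
t = torch.tensor([rank])
res = idist.all_gather(t, group=ranks)
if rank in ranks:
    assert torch.equal(res, torch.tensor(sorted(ranks)))
else:
    assert res == t
```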

tests/run_cpu_tests.sh

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ fi
 # Run 2 processes with --dist=each
 CUDA_VISIBLE_DEVICES="" run_tests \
     --core_args "-m distributed -vvv tests/ignite" \
-    --world_size 2 \
+    --world_size 4 \
     --cache_dir ".cpu-distrib" \
     --skip_distrib_tests 0 \
     --use_coverage 1 \
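
The bump from 2 to 4 workers is plausibly needed by the group tests above: `_test_distrib_all_gather_group` derives its subgroup from the world size, and only with more than two processes does it get a multi-member group that still excludes rank 0 (exercising the `NON_GROUP_MEMBER` path). A quick illustration:

```python
def test_group_ranks(world_size: int) -> list:
    # Mirrors the expression used in _test_distrib_all_gather_group.
    return list(range(world_size - 1, 0, -1))


assert test_group_ranks(2) == [1]        # single-member group
assert test_group_ranks(4) == [3, 2, 1]  # multi-member group, rank 0 excluded
```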
