28
28
DOCKER_IMAGE: "pytorch/almalinux-builder:cuda12.4"
29
29
REPOSITORY: ${{ github.repository }}
30
30
PR_NUMBER: ${{ github.event.pull_request.number }}
31
- runs-on: linux.8xlarge.nvidia.gpu
31
+ runs-on: linux.g4dn.12xlarge.nvidia.gpu
32
32
timeout-minutes: 85
33
33
34
34
steps:
@@ -126,8 +126,8 @@ jobs:
126
126
max_attempts: 5
127
127
timeout_minutes: 45
128
128
shell: bash
129
- command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
130
- new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
129
+ command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 4'
130
+ new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 4'
131
131
132
132
- name: Upload coverage to Codecov
133
133
uses: codecov/codecov-action@v3
@@ -154,15 +154,15 @@ jobs:
154
154
155
155
# Check training on cifar10, run with NCCL backend using torchrun
156
156
## initial run
157
- CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
157
+ CI=1 torchrun --nproc_per_node=4 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
158
158
## resume
159
- CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
159
+ CI=1 torchrun --nproc_per_node=4 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
160
160
161
161
# Check training on cifar10, run with NCCL backend using spawn
162
162
## initial run
163
- CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
163
+ CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=4 --checkpoint_every=200 --stop_iteration=500
164
164
## resume
165
- CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
165
+ CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=4 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
166
166
167
167
EOF
168
168
)
0 commit comments