Skip to content

Commit 8be76b2

Browse files
committed
Changed runner to linux.g4dn.12xlarge.nvidia.gpu
1 parent 6e96986 commit 8be76b2

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

.github/workflows/gpu-tests.yml

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ jobs:
28 28
DOCKER_IMAGE: "pytorch/almalinux-builder:cuda12.4"
29 29
REPOSITORY: ${{ github.repository }}
30 30
PR_NUMBER: ${{ github.event.pull_request.number }}
31-
runs-on: linux.8xlarge.nvidia.gpu
31+
runs-on: linux.g4dn.12xlarge.nvidia.gpu
32 32
timeout-minutes: 85
33 33

34 34
steps:
@@ -126,8 +126,8 @@ jobs:
126 126
max_attempts: 5
127 127
timeout_minutes: 45
128 128
shell: bash
129-
command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
130-
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
129+
command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 4'
130+
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 4'
131 131

132 132
- name: Upload coverage to Codecov
133 133
uses: codecov/codecov-action@v3
@@ -154,15 +154,15 @@ jobs:
154 154
155 155
# Check training on cifar10, run with NCCL backend using torchrun
156 156
## initial run
157-
CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
157+
CI=1 torchrun --nproc_per_node=4 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
158 158
## resume
159-
CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
159+
CI=1 torchrun --nproc_per_node=4 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
160 160
161 161
# Check training on cifar10, run with NCCL backend using spawn
162 162
## initial run
163-
CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
163+
CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=4 --checkpoint_every=200 --stop_iteration=500
164 164
## resume
165-
CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
165+
CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=4 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
166 166
167 167
EOF
168 168
)

0 commit comments

Comments
 (0)