diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index b63d0ce3..f29b42ca 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -4,6 +4,10 @@ on:
   pull_request:
     branches:
       - main
+  # Also run this workflow on pushes to main, not only on pull requests.
+  push:
+    branches:
+      - main

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a9bbcd38..14b8ff52 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,6 +4,10 @@ on:
   pull_request:
     branches:
       - main
+  # Also run this workflow on pushes to main, not only on pull requests.
+  push:
+    branches:
+      - main

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
diff --git a/ci_check/change_files.py b/ci_check/change_files.py
index 25b69cf7..2f197a95 100644
--- a/ci_check/change_files.py
+++ b/ci_check/change_files.py
@@ -40,7 +40,16 @@ def main():
             "modifications": [
                 (
                     "torch.cuda.empty_cache()",
-                    "if use_cuda: torch.cuda.empty_cache()",
+                    "if use_cuda: torch.cuda.empty_cache()"
+                ),
+                # NOTE(review): presumably the CI runner has no GPU, hence the gloo backend and the disabled device pinning - confirm.
+                (
+                    "init_process_group(backend='nccl')",
+                    "init_process_group(backend='gloo')"
+                ),
+                (
+                    "torch.cuda.set_device(int(os.environ['LOCAL_RANK']))",
+                    "# torch.cuda.set_device(int(os.environ['LOCAL_RANK']))"
                 )
             ],
         }
diff --git a/ci_check/run.sh b/ci_check/run.sh
index 24dc9da9..d5ad5dcb 100644
--- a/ci_check/run.sh
+++ b/ci_check/run.sh
@@ -4,7 +4,23 @@ current_directory=$(pwd)
 llmc=$(echo "$current_directory" | sed 's/\/ci_check$//')
 export PYTHONPATH=$llmc:$PYTHONPATH

+config=${llmc}/ci_check/awq_w4a16_fakequant_eval.yml
+
+nnodes=1
+nproc_per_node=1
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=$((10000 + RANDOM % 20000))
+
+# Bash reserves RANDOM: assigning to it only reseeds the generator, so the
+# UUID is stored in task_id directly to serve as the rendezvous/task id.
+task_id=$(python -c 'import uuid; print(uuid.uuid4())')

 cd ../scripts

-python -m llmc --config ../ci_check/awq_w4a16_fakequant_eval.yml
+torchrun \
+    --nnodes $nnodes \
+    --nproc_per_node $nproc_per_node \
+    --rdzv_id $task_id \
+    --rdzv_backend c10d \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    ${llmc}/llmc/__main__.py --config $config --task_id $task_id