From 33415759b1b44de65bf0af2496cf4efd03ac8052 Mon Sep 17 00:00:00 2001 From: Stefan Doerr Date: Tue, 28 May 2024 14:32:25 +0300 Subject: [PATCH] update the job template to chdir before executing each of the run commands --- jobqueues/slurmqueue.py | 11 +++++++---- jobqueues/templates/SLURM_job.sh.j2 | 6 +++++- tests/test_slurmqueue.py | 16 ++++++---------- .../_slurm_queue_nvidia_mps_job.sh | 5 ++++- tests/test_slurmqueue/_submit_command.sh | 1 + tests/test_slurmqueue/_submit_folder.sh | 3 ++- tests/test_slurmqueue/_submit_multi_folder.sh | 2 +- 7 files changed, 26 insertions(+), 18 deletions(-) diff --git a/jobqueues/slurmqueue.py b/jobqueues/slurmqueue.py index acbd243..5c21a58 100644 --- a/jobqueues/slurmqueue.py +++ b/jobqueues/slurmqueue.py @@ -328,10 +328,15 @@ def _find_binary(binary, permissive=False): ret = os.path.abspath(ret) return ret - def _createJobScript(self, fname, workdir, runsh, nvidia_mps=False): + def _createJobScript(self, fname, workdir, runsh, nvidia_mps=False, commands=None): from jobqueues.config import template_env + # Create a list of lists with the directory of the run.sh and the run.sh itself runsh = ensurelist(runsh) + if commands is None: + runsh = [[os.path.dirname(os.path.abspath(x)), x] for x in runsh] + else: + runsh = [[workdir, runsh[0]]] workdir = os.path.abspath(workdir) sentinel = os.path.normpath(os.path.join(workdir, self._sentinel)) @@ -356,8 +361,6 @@ def _createJobScript(self, fname, workdir, runsh, nvidia_mps=False): errorstream = None outputstream = None sentinel = None - else: - prerun += [f"cd {workdir}"] template = template_env.get_template("SLURM_job.sh.j2") job_str = template.render( @@ -464,7 +467,7 @@ def submit(self, dirs, commands=None, _dryrun=False, nvidia_mps=False): self._cleanSentinel(d) jobscript = os.path.abspath(os.path.join(d, self.jobscript)) - self._createJobScript(jobscript, d, runscript) + self._createJobScript(jobscript, d, runscript, commands=commands) try: if _dryrun: logger.info(f"Dry run. Here it would call submit on {jobscript}") diff --git a/jobqueues/templates/SLURM_job.sh.j2 b/jobqueues/templates/SLURM_job.sh.j2 index 9c50c88..56acffb 100644 --- a/jobqueues/templates/SLURM_job.sh.j2 +++ b/jobqueues/templates/SLURM_job.sh.j2 @@ -73,9 +73,13 @@ unset CUDA_VISIBLE_DEVICES {% endif %} {% for rsh in runsh %} -{{ rsh }} {% if run_as_daemon %} | tee log_{{loop.index}}.txt &{% endif %} +cd {{ rsh[0] }} +{{ rsh[1] }} {% if run_as_daemon %} | tee log_{{loop.index}}.txt &{% endif %} {% endfor %} +{% if runsh|length > 1 %} +cd {{ workdir }} +{% endif %} {% if nvidia_mps %} wait diff --git a/tests/test_slurmqueue.py b/tests/test_slurmqueue.py index f463273..2683dd9 100644 --- a/tests/test_slurmqueue.py +++ b/tests/test_slurmqueue.py @@ -65,7 +65,8 @@ def _test_config(datadir): def _test_submit_command(datadir): - import tempfile + execdir = str(datadir.join("0")) + os.makedirs(execdir, exist_ok=True) sl = SlurmQueue(_findExecutables=False) sl.partition = "jobqueues_test" @@ -76,16 +77,11 @@ def _test_submit_command(datadir): sl.envvars = "TEST=3" sl.useworkdir = False - with tempfile.TemporaryDirectory() as tmpdir: - try: - sl.submit([tmpdir], commands=["sleep 5"]) - except Exception as e: - print(e) - pass + sl.submit([execdir], commands=["sleep 5"], _dryrun=True) - _compare_jobsh( - os.path.join(tmpdir, "job.sh"), datadir.join("_submit_command.sh"), tmpdir - ) + _compare_jobsh( + os.path.join(execdir, "job.sh"), datadir.join("_submit_command.sh"), datadir + ) def _test_submit_folder(datadir): diff --git a/tests/test_slurmqueue/_slurm_queue_nvidia_mps_job.sh b/tests/test_slurmqueue/_slurm_queue_nvidia_mps_job.sh index 1c59326..65b6967 100644 --- a/tests/test_slurmqueue/_slurm_queue_nvidia_mps_job.sh +++ b/tests/test_slurmqueue/_slurm_queue_nvidia_mps_job.sh @@ -13,7 +13,6 @@ trap "touch TESTDIR_PLACEHOLDER/0/jobqueues.done" EXIT SIGTERM -cd TESTDIR_PLACEHOLDER/0 # assume CUDA_VISIBLE_DEVICES has been set by slurm GPU=$CUDA_VISIBLE_DEVICES @@ -38,9 +37,13 @@ echo "start_server -uid ${UID}" | nvidia-cuda-mps-control unset CUDA_VISIBLE_DEVICES +cd TESTDIR_PLACEHOLDER/0 TESTDIR_PLACEHOLDER/0/run.sh | tee log_1.txt & +cd TESTDIR_PLACEHOLDER/1 TESTDIR_PLACEHOLDER/1/run.sh | tee log_2.txt & +cd TESTDIR_PLACEHOLDER/2 TESTDIR_PLACEHOLDER/2/run.sh | tee log_3.txt & +cd TESTDIR_PLACEHOLDER/0 wait # quit the server and the control. It will only quit the one corresponding to the CUDA_MPS_PIPE_DIRECTORY env variable diff --git a/tests/test_slurmqueue/_submit_command.sh b/tests/test_slurmqueue/_submit_command.sh index 4eaec68..7d97fca 100644 --- a/tests/test_slurmqueue/_submit_command.sh +++ b/tests/test_slurmqueue/_submit_command.sh @@ -14,4 +14,5 @@ +cd TESTDIR_PLACEHOLDER/0 sleep 5 diff --git a/tests/test_slurmqueue/_submit_folder.sh b/tests/test_slurmqueue/_submit_folder.sh index de43c69..015dccc 100644 --- a/tests/test_slurmqueue/_submit_folder.sh +++ b/tests/test_slurmqueue/_submit_folder.sh @@ -15,7 +15,8 @@ trap "touch TESTDIR_PLACEHOLDER/0/jobqueues.done" EXIT SIGTERM -cd TESTDIR_PLACEHOLDER/0 +cd TESTDIR_PLACEHOLDER/0 TESTDIR_PLACEHOLDER/0/run.sh + diff --git a/tests/test_slurmqueue/_submit_multi_folder.sh b/tests/test_slurmqueue/_submit_multi_folder.sh index c53e89b..d252d40 100644 --- a/tests/test_slurmqueue/_submit_multi_folder.sh +++ b/tests/test_slurmqueue/_submit_multi_folder.sh @@ -15,8 +15,8 @@ trap "touch TESTDIR_PLACEHOLDER/jobqueues.done" EXIT SIGTERM -cd TESTDIR_PLACEHOLDER +cd TESTDIR_PLACEHOLDER TESTDIR_PLACEHOLDER/run.sh