Skip to content

Commit

Permalink
update the job template to chdir before executing each of the run commands
Browse files Browse the repository at this point in the history
  • Loading branch information
stefdoerr committed May 28, 2024
1 parent e3a3297 commit 3341575
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 18 deletions.
11 changes: 7 additions & 4 deletions jobqueues/slurmqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,15 @@ def _find_binary(binary, permissive=False):
ret = os.path.abspath(ret)
return ret

def _createJobScript(self, fname, workdir, runsh, nvidia_mps=False):
def _createJobScript(self, fname, workdir, runsh, nvidia_mps=False, commands=None):
from jobqueues.config import template_env

# Create a list of lists with the directory of the run.sh and the run.sh itself
runsh = ensurelist(runsh)
if commands is None:
runsh = [[os.path.dirname(os.path.abspath(x)), x] for x in runsh]
else:
runsh = [[workdir, runsh[0]]]

workdir = os.path.abspath(workdir)
sentinel = os.path.normpath(os.path.join(workdir, self._sentinel))
Expand All @@ -356,8 +361,6 @@ def _createJobScript(self, fname, workdir, runsh, nvidia_mps=False):
errorstream = None
outputstream = None
sentinel = None
else:
prerun += [f"cd {workdir}"]

template = template_env.get_template("SLURM_job.sh.j2")
job_str = template.render(
Expand Down Expand Up @@ -464,7 +467,7 @@ def submit(self, dirs, commands=None, _dryrun=False, nvidia_mps=False):
self._cleanSentinel(d)

jobscript = os.path.abspath(os.path.join(d, self.jobscript))
self._createJobScript(jobscript, d, runscript)
self._createJobScript(jobscript, d, runscript, commands=commands)
try:
if _dryrun:
logger.info(f"Dry run. Here it would call submit on {jobscript}")
Expand Down
6 changes: 5 additions & 1 deletion jobqueues/templates/SLURM_job.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,13 @@ unset CUDA_VISIBLE_DEVICES
{% endif %}

{% for rsh in runsh %}
{{ rsh }} {% if run_as_daemon %} | tee log_{{loop.index}}.txt &{% endif %}
cd {{ rsh[0] }}
{{ rsh[1] }} {% if run_as_daemon %} | tee log_{{loop.index}}.txt &{% endif %}

{% endfor %}
{% if runsh|length > 1 %}
cd {{ workdir }}
{% endif %}

{% if nvidia_mps %}
wait
Expand Down
16 changes: 6 additions & 10 deletions tests/test_slurmqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ def _test_config(datadir):


def _test_submit_command(datadir):
import tempfile
execdir = str(datadir.join("0"))
os.makedirs(execdir, exist_ok=True)

sl = SlurmQueue(_findExecutables=False)
sl.partition = "jobqueues_test"
Expand All @@ -76,16 +77,11 @@ def _test_submit_command(datadir):
sl.envvars = "TEST=3"
sl.useworkdir = False

with tempfile.TemporaryDirectory() as tmpdir:
try:
sl.submit([tmpdir], commands=["sleep 5"])
except Exception as e:
print(e)
pass
sl.submit([execdir], commands=["sleep 5"], _dryrun=True)

_compare_jobsh(
os.path.join(tmpdir, "job.sh"), datadir.join("_submit_command.sh"), tmpdir
)
_compare_jobsh(
os.path.join(execdir, "job.sh"), datadir.join("_submit_command.sh"), datadir
)


def _test_submit_folder(datadir):
Expand Down
5 changes: 4 additions & 1 deletion tests/test_slurmqueue/_slurm_queue_nvidia_mps_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

trap "touch TESTDIR_PLACEHOLDER/0/jobqueues.done" EXIT SIGTERM

cd TESTDIR_PLACEHOLDER/0

# assume CUDA_VISIBLE_DEVICES has been set by slurm
GPU=$CUDA_VISIBLE_DEVICES
Expand All @@ -38,9 +37,13 @@ echo "start_server -uid ${UID}" | nvidia-cuda-mps-control

unset CUDA_VISIBLE_DEVICES

cd TESTDIR_PLACEHOLDER/0
TESTDIR_PLACEHOLDER/0/run.sh | tee log_1.txt &
cd TESTDIR_PLACEHOLDER/1
TESTDIR_PLACEHOLDER/1/run.sh | tee log_2.txt &
cd TESTDIR_PLACEHOLDER/2
TESTDIR_PLACEHOLDER/2/run.sh | tee log_3.txt &
cd TESTDIR_PLACEHOLDER/0

wait
# quit the server and the control. It will only quit the one corresponding to the CUDA_MPS_PIPE_DIRECTORY env variable
Expand Down
1 change: 1 addition & 0 deletions tests/test_slurmqueue/_submit_command.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@



cd TESTDIR_PLACEHOLDER/0
sleep 5
3 changes: 2 additions & 1 deletion tests/test_slurmqueue/_submit_folder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

trap "touch TESTDIR_PLACEHOLDER/0/jobqueues.done" EXIT SIGTERM

cd TESTDIR_PLACEHOLDER/0


cd TESTDIR_PLACEHOLDER/0
TESTDIR_PLACEHOLDER/0/run.sh

2 changes: 1 addition & 1 deletion tests/test_slurmqueue/_submit_multi_folder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

trap "touch TESTDIR_PLACEHOLDER/jobqueues.done" EXIT SIGTERM

cd TESTDIR_PLACEHOLDER


cd TESTDIR_PLACEHOLDER
TESTDIR_PLACEHOLDER/run.sh

0 comments on commit 3341575

Please sign in to comment.