From e50f2d3463133db536e3204cd7c27b528122a4e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5vard=20Berland?= Date: Tue, 12 Nov 2024 13:27:51 +0100 Subject: [PATCH] Assume non-LSF host error is flaky The LSF driver experiences crashes stemming from bsub returning with the error message 'Request from non-LSF host rejected'. There are reasons to believe this is not a permanent error, but some flakiness in the IP infrastructure, and thus should be categorized as a retriable failure. The reason for believing this is flakiness is mostly the fact that the same error is also seen on 'bjobs'-calls. If it were a permanent failure scenario, there would be an enormous number of errors from these bjobs calls, but there is not. --- src/ert/scheduler/lsf_driver.py | 9 ++++++++- tests/ert/unit_tests/scheduler/test_lsf_driver.py | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ert/scheduler/lsf_driver.py b/src/ert/scheduler/lsf_driver.py index d5893bbaad1..2f45b2bde5c 100644 --- a/src/ert/scheduler/lsf_driver.py +++ b/src/ert/scheduler/lsf_driver.py @@ -94,7 +94,14 @@ class RunningJob: LSF_INFO_JSON_FILENAME = "lsf_info.json" FLAKY_SSH_RETURNCODE = 255 JOB_ALREADY_FINISHED_BKILL_MSG = "Job has already finished" -BSUB_FAILURE_MESSAGES = ("Job not submitted",) +BSUB_FAILURE_MESSAGES = ( + "Error in rusage section", + "Expeced number, string", + "No such queue", + "Too many processors requested", + "cannot be used in the resource requirement section", + "duplicate section", +) def _parse_jobs_dict(jobs: Mapping[str, JobState]) -> dict[str, AnyJob]: diff --git a/tests/ert/unit_tests/scheduler/test_lsf_driver.py b/tests/ert/unit_tests/scheduler/test_lsf_driver.py index 411e5649807..6d98b722f33 100644 --- a/tests/ert/unit_tests/scheduler/test_lsf_driver.py +++ b/tests/ert/unit_tests/scheduler/test_lsf_driver.py @@ -607,7 +607,6 @@ async def test_that_bsub_will_retry_and_fail( " '&' cannot be used in the resource requirement section. 
Job not submitted.", ), (255, "Error in rusage section. Job not submitted."), - (255, "Job not submitted."), ], ) async def test_that_bsub_will_fail_without_retries( @@ -633,6 +632,8 @@ async def test_that_bsub_will_fail_without_retries( [ (0, "void"), (FLAKY_SSH_RETURNCODE, ""), + (0, "Request from non-LSF host rejected"), + (FLAKY_SSH_RETURNCODE, "Request from non-LSF host rejected"), ], ) async def test_that_bsub_will_retry_and_succeed(