Skip to content

Commit

Permalink
GPU test for missing keys
Browse files Browse the repository at this point in the history
  • Loading branch information
jdh4 committed Nov 5, 2024
1 parent 68dd5f9 commit 3287b29
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 29 deletions.
27 changes: 14 additions & 13 deletions jobstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,18 +302,19 @@ def parse_stats(self):
self.cpu_util_error_code = 0
self.cpu_util__node_used_alloc_cores = []
for n in sp_node:
try:
d = sp_node[n]
if 'total_time' in d and 'cpus' in d:
used = sp_node[n]['total_time']
cores = sp_node[n]['cpus']
except Exception:
self.cpu_util_error_code = 1
break
else:
alloc = self.diff * cores
total += alloc
total_used += used
total_cores += cores
self.cpu_util__node_used_alloc_cores.append((n, used, alloc, cores))
else:
self.cpu_util_error_code = 1
self.cpu_util__node_used_alloc_cores.append((n, None, None, None))
break
if self.cpu_util_error_code == 0:
if total_used > total:
self.cpu_util_error_code = 2
Expand All @@ -328,18 +329,19 @@ def parse_stats(self):
self.cpu_mem_error_code = 0
self.cpu_mem__node_used_alloc_cores = []
for n in sp_node:
try:
d = sp_node[n]
if 'used_memory' in d and 'total_memory' in d and 'cpus' in d:
used = sp_node[n]['used_memory']
alloc = sp_node[n]['total_memory']
cores = sp_node[n]['cpus']
except Exception:
self.cpu_mem_error_code = 1
break
else:
total += alloc
total_used += used
total_cores += cores
self.cpu_mem__node_used_alloc_cores.append((n, used, alloc, cores))
else:
self.cpu_mem_error_code = 1
self.cpu_mem__node_used_alloc_cores.append((n, None, None, None))
break
if self.cpu_mem_error_code == 0:
if total_used > total:
self.cpu_mem_error_code = 2
Expand Down Expand Up @@ -369,14 +371,14 @@ def parse_stats(self):
break
self.gpu_util_total__util_gpus = (overall, overall_gpu_count)

# gpu memory usage
# gpu memory
overall = 0
overall_total = 0
self.gpu_mem_error_code = 0
self.gpu_mem__node_used_total_index = []
for n in sp_node:
d = sp_node[n]
if 'gpu_total_memory' in d and 'gpu_total_memory' in d:
if 'gpu_used_memory' in d and 'gpu_total_memory' in d:
gpus = list(d['gpu_total_memory'].keys())
gpus.sort()
for g in gpus:
Expand All @@ -396,7 +398,6 @@ def parse_stats(self):
# Assign rather than compare: `==` only evaluated an expression and threw
# the result away, so error code 3 was never recorded on the instance.
self.gpu_mem_error_code = 3
self.gpu_mem_total__used_alloc = (overall, overall_total)


def __str__(self, compact=False):
js_data = {'nodes': self.sp_node, 'total_time': self.diff, 'gpus': self.gpus}
if compact:
Expand Down
5 changes: 0 additions & 5 deletions output_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,15 +380,10 @@ def output(self, no_color: bool=True) -> str:
heading = f"{self.txt_bold}Overall Utilization{self.txt_normal}"
report += heading.center(self.width) + "\n"
report += self.width * "=" + "\n"
# overall CPU time utilization
report += self.output_overall_cpu_util()
# overall CPU memory utilization
report += self.output_overall_cpu_memory_usage()
# GPUs
if self.js.gpus:
# overall GPU utilization
report += self.output_overall_gpu_util()
# overall GPU memory usage
report += self.output_overall_gpu_memory_usage()
report += "\n"
########################################################################
Expand Down
126 changes: 115 additions & 11 deletions tests/test_jobstats.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import base64
import gzip
import json
from jobstats import Jobstats
import pytest
from jobstats import Jobstats


@pytest.fixture
def simple_stats(mocker):
def simple_cpu_job(mocker):
cols = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
ss64 = ('JS1:H4sIADelIWcC/1WNQQqDMBBF7zLrtEzG0ZhcphQzqGBM0bgQyd0bUii4fe8'
Expand All @@ -21,6 +21,24 @@ def simple_stats(mocker):
return stats


@pytest.fixture
def cpu_total_time_missing(mocker):
    """Return a Jobstats for a CPU job whose node entry has no total_time.

    The per-node dictionary carries only "cpus" and "total_memory"; the
    top-level "total_time" of the summary is still present.
    """
    header = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
              'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
    summary = {
        "gpus": 0,
        "nodes": {"della-r1c4n3": {"cpus": 15, "total_memory": 1073741824}},
        "total_time": 36322,
    }
    # The summary travels in the AdminComment column as
    # "JS1:" + base64(gzip(json)).
    payload = json.dumps(summary, sort_keys=True, indent=4).encode('ascii')
    compressed = gzip.compress(payload)
    admin_comment = "JS1:" + base64.b64encode(compressed).decode('ascii')
    row = ('49589697|1690927903|1690964225|della|billing=15,cpu=15,mem=1G,n'
           'ode=1|%s|aturing|chem|TIMEOUT|1|15|1G|short|cpu|600|scf.cmd\n'
           % admin_comment)
    # Patch subprocess.check_output so it returns the canned sacct bytes.
    mocker.patch("subprocess.check_output",
                 return_value=bytes(header + row, "utf-8"))
    return Jobstats(jobid="49589697", prom_server="DUMMY-SERVER")


@pytest.fixture
def cpu_mem_over_100_percent(mocker):
cols = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
Expand All @@ -39,9 +57,9 @@ def cpu_mem_over_100_percent(mocker):

@pytest.fixture
def cpu_used_memory_missing(mocker):
"""A job where the used_memory key is missing in the JSON."""
cols = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
# used_memory was removed
nodes = {"della-r1c4n3": {"cpus": 15, "total_memory": 1073741824, "total_time": 505333.1}}
js_data = {"gpus": 0, "nodes": nodes, "total_time": 36322}
data = json.dumps(js_data, sort_keys=True, indent=4)
Expand All @@ -57,9 +75,9 @@ def cpu_used_memory_missing(mocker):

@pytest.fixture
def cpu_total_memory_is_zero(mocker):
"""A job where total_memory is zero."""
cols = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
# total memory is zero
nodes = {"della-r1c4n3": {"cpus": 15, "used_memory": 1073741824, "total_memory": 0}}
js_data = {"gpus": 0, "nodes": nodes, "total_time": 36322}
data = json.dumps(js_data, sort_keys=True, indent=4)
Expand All @@ -73,15 +91,88 @@ def cpu_total_memory_is_zero(mocker):
return stats


def test_simple_job(mocker, simple_stats):
total_cpu_time = simple_stats.diff * int(simple_stats.ncpus)
@pytest.fixture
def simple_gpu_job(mocker):
    """Return a Jobstats built from a canned GPU job record without issues.

    The job summary is a pre-encoded "JS1:" blob (opaque here) placed in the
    AdminComment column of the fake sacct output.
    """
    header = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
              'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
    admin_comment = ('JS1:H4sIAKA3KmcC/12OTQ6DIBCF7zJra4AZYPAyxlRiSFBMi4vWcPdqbZvY5cv'
                     '73s8KU+r9HZoVeh9jd4lCDhJ3nVPuYjv6Md0e0Ei0hOw0klUVLHff/ywmw2iFUr'
                     'tzpHIYPTTaGiVqruA6L9uE3OxhXtpz8Qpyq9DsHJJ2SpQDOi0cDKIkNqwUfZkcY'
                     'nh2OaTpwziuTSnl74ZwRO/EfqK8APTfhRDzAAAA')
    row = ('60155093|1730723367|1730774311|della|billing=10485,cpu=12,gres/'
           'gpu=1,mem=128G,node=1|%s|aturing|cs|COMPLETED|1|12|128G|gpu-sho'
           'rt|gpu-shared|990|emb\n' % admin_comment)
    # Patch subprocess.check_output so it returns the canned sacct bytes.
    mocker.patch("subprocess.check_output",
                 return_value=bytes(header + row, "utf-8"))
    return Jobstats(jobid="60155093", prom_server="DUMMY-SERVER")


@pytest.fixture
def gpu_utilization_missing(mocker):
    """Return a Jobstats for a GPU job lacking the gpu_utilization key.

    NOTE(review): the summary is a pre-encoded "JS1:" blob, so the missing
    key cannot be seen in this file; the description follows the fixture's
    stated intent.
    """
    header = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
              'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
    admin_comment = ('JS1:H4sIAMnXKGcC/1WNWwqDMBQF93K/05Kbk6jNZoqYiwiJEY0fItl7HxSKn8P'
                     'M4Zw05yAb+ZOCxNjfVh7sjA+XXPr4TJLyepBn3aK13BmraN8k/MWjg3XMaNRvUq'
                     'Yk5J12AO6saFj29wG7Wq8FGhijaPxqXV+1b7sLiwAAAA==')
    row = ('45122291|1674651996|1674709902|della|billing=2621,cpu=1,gres/gp'
           'u=1,mem=32G,node=1|%s|aturing|ee|TIMEOUT|1|1|32G|gpu-short|mig|'
           '960|sys/dashboard/sys/jupyter\n' % admin_comment)
    # Patch subprocess.check_output so it returns the canned sacct bytes.
    mocker.patch("subprocess.check_output",
                 return_value=bytes(header + row, "utf-8"))
    return Jobstats(jobid="45122291", prom_server="DUMMY-SERVER")


@pytest.fixture
def gpu_used_memory_missing(mocker):
    """Return a Jobstats for a GPU job whose gpu_used_memory key is missing.

    The node entry keeps "gpu_total_memory", "gpu_utilization" and the CPU
    "used_memory"; only the per-GPU "gpu_used_memory" mapping is left out.
    """
    cols = ('JobIDRaw|Start|End|Cluster|AllocTRES|AdminComment|User|Account|'
            'State|NNodes|NCPUS|ReqMem|QOS|Partition|TimelimitRaw|JobName\n')
    js_data = {
        "gpus": 1,
        "nodes": {
            "della-l04g11": {
                "cpus": 1,
                # "gpu_used_memory" is deliberately absent from this node.
                "gpu_total_memory": {
                    "2": 85899345920
                },
                "gpu_utilization": {
                    "2": 0
                },
                "total_memory": 268435456000,
                "total_time": 11423.0,
                "used_memory": 10614919168
            }
        },
        "total_time": 11540
    }
    # The summary travels in the AdminComment column as
    # "JS1:" + base64(gzip(json)).
    data = json.dumps(js_data, sort_keys=True, indent=4)
    ss64 = "JS1:" + base64.b64encode(gzip.compress(data.encode('ascii'))).decode('ascii')
    data = ('46915114|1681342706|1681354246|della|billing=20480,cpu=1,gres/g'
            'pu=1,mem=250G,node=1|%s|aturing|ee|CANCELLED by 223988|1|1|250G'
            '|gpu-short|gpu|840|run_main_1.cmd\n' % ss64)
    sacct_bytes = bytes(cols + data, "utf-8")
    # Patch subprocess.check_output so it returns the canned sacct bytes.
    mocker.patch("subprocess.check_output", return_value=sacct_bytes)
    stats = Jobstats(jobid="46915114", prom_server="DUMMY-SERVER")
    return stats


def test_simple_cpu_job(mocker, simple_cpu_job):
    """Check CPU time and CPU memory totals for a well-formed CPU job.

    Every assertion reads from the ``simple_cpu_job`` fixture; the earlier
    ``simple_stats`` name is not a parameter of this test and must not be
    referenced.
    """
    # Allocated CPU time is the job's elapsed time multiplied by its cores.
    total_cpu_time = simple_cpu_job.diff * int(simple_cpu_job.ncpus)
    expected = (79498.2, total_cpu_time, 40)
    assert simple_cpu_job.cpu_util_total__used_alloc_cores == expected
    assert simple_cpu_job.cpu_util_error_code == 0
    expected = (5624422400, 10737418240, 40)
    assert simple_cpu_job.cpu_mem_total__used_alloc_cores == expected
    assert simple_cpu_job.cpu_mem_error_code == 0
    assert simple_cpu_job.gpus == 0


def test_cpu_total_time_missing(mocker, cpu_total_time_missing):
    """A node entry without total_time must set the CPU utilization error code.

    The CPU-memory error code is not checked here: it would only be 1
    because the fixture also happens to omit used_memory, which is a
    different condition from the one this test is named for.
    """
    assert cpu_total_time_missing.cpu_util_error_code == 1


def test_cpu_used_memory_missing(mocker, cpu_used_memory_missing):
Expand All @@ -94,3 +185,16 @@ def test_cpu_mem_over_100_percent(mocker, cpu_mem_over_100_percent):

def test_cpu_total_memory_is_zero(mocker, cpu_total_memory_is_zero):
    """The zero-total_memory fixture must report CPU-memory error code 3."""
    observed_code = cpu_total_memory_is_zero.cpu_mem_error_code
    assert observed_code == 3


def test_simple_gpu_job(mocker, simple_gpu_job):
    """A well-formed GPU job reports no GPU utilization or memory errors."""
    util_code = simple_gpu_job.gpu_util_error_code
    assert util_code == 0
    mem_code = simple_gpu_job.gpu_mem_error_code
    assert mem_code == 0


def test_gpu_utilization_missing(mocker, gpu_utilization_missing):
    """The gpu_utilization_missing fixture must report GPU-util error code 1."""
    observed_code = gpu_utilization_missing.gpu_util_error_code
    assert observed_code == 1


def test_gpu_used_memory_missing(mocker, gpu_used_memory_missing):
    """The gpu_used_memory_missing fixture must report GPU-memory error code 1."""
    observed_code = gpu_used_memory_missing.gpu_mem_error_code
    assert observed_code == 1

0 comments on commit 3287b29

Please sign in to comment.