-
Notifications
You must be signed in to change notification settings - Fork 0
/
cryoem_gpu_usage.py
143 lines (121 loc) · 5.65 KB
/
cryoem_gpu_usage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/licensed/anaconda3/2023.9/bin/python -uB
import os
import re
import argparse
import subprocess
import smtplib
from datetime import datetime
from datetime import timedelta
from email.message import EmailMessage
import pandas as pd
def get_time_window(num_days: int) -> tuple[str, str, int]:
    """Return the reporting window that ends at 08:00 today.

    The window starts at 08:00 ``num_days`` days before today and ends at
    08:00 today, both taken from the naive ``datetime.now()`` clock.

    Args:
        num_days: Length of the window in days.

    Returns:
        A ``(start_date, end_date, elapsed_seconds)`` tuple. Both dates are
        formatted as ``YYYY-MM-DDTHH:MM:SS`` and ``elapsed_seconds`` is the
        whole number of seconds between them.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    # Pin both ends to 08:00:00 so the window length does not depend on the
    # time of day at which the script happens to run.
    window_end = datetime.now().replace(hour=8, minute=0, second=0, microsecond=0)
    window_start = window_end - timedelta(days=num_days)
    # total_seconds() returns a float, but the signature promises an int; the
    # value is always a whole number because both ends sit on 08:00:00.
    elapsed_seconds = int((window_end - window_start).total_seconds())
    return (
        window_start.strftime(timestamp_format),
        window_end.strftime(timestamp_format),
        elapsed_seconds,
    )
def gpus_per_job(tres: str) -> int:
    """Extract the GPU count from a TRES string.

    Looks for the first ``gres/gpu=<digits>`` entry in ``tres`` and returns
    its numeric value; returns 0 when no such entry is present.
    """
    first_match = re.search(r"gres/gpu=\d+", tres)
    if first_match is None:
        return 0
    return int(first_match.group().replace("gres/gpu=", ""))
def get_data_from_sacct(clusters: str,
                        start_date: str,
                        end_date: str,
                        partitions: str,
                        fields: str) -> pd.DataFrame:
    """Run ``sacct`` and return its pipe-delimited output as a dataframe.

    Args:
        clusters: Value passed to ``sacct -M``.
        start_date: Value passed to ``sacct -S``.
        end_date: Value passed to ``sacct -E``.
        partitions: Extra option text inserted verbatim into the command
            (the caller in this file passes ``"-r <partition>"``).
        fields: Comma-separated field names passed to ``sacct -o``; they
            also become the column names of the result.

    Returns:
        A dataframe with one row per output line and one column per entry
        in ``fields``. All values are strings. When ``sacct`` prints
        nothing, an empty dataframe with the expected columns is returned.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        subprocess.TimeoutExpired: If the command runs longer than 100 s.
    """
    cmd = f"sacct -M {clusters} -a -X -P -n -S {start_date} -E {end_date} {partitions} -o {fields}"
    # NOTE: the command is built by string interpolation and run through the
    # shell, so every argument must come from a trusted source.
    output = subprocess.run(cmd,
                            stdout=subprocess.PIPE,
                            shell=True,
                            timeout=100,
                            text=True,
                            check=True)
    columns = fields.split(",")
    # Split on line boundaries (not on arbitrary whitespace) so a field that
    # contains a space cannot break a record apart; skip blank lines.
    rows = [line.split("|") for line in output.stdout.splitlines() if line.strip()]
    # Passing the column names to the constructor keeps an empty result
    # usable: assigning names afterwards fails on a zero-column frame.
    return pd.DataFrame(rows, columns=columns)
def send_email_html(text, addressee, subject="GPU Usage", sender="[email protected]"):
    """Email ``text`` to ``addressee`` as a monospaced HTML message.

    The text is wrapped in a ``<pre>`` element inside a minimal HTML
    document and handed to the SMTP server on ``localhost``.
    """
    # Build the HTML document around the preformatted report text.
    body = (
        '<html><head></head><body>'
        f'<font face="Courier New, Courier, monospace"><pre>{text}</pre></font></body></html>'
    )

    message = EmailMessage()
    message['Subject'] = subject
    message['From'] = sender
    message['To'] = addressee
    message.set_content(body, subtype="html")

    with smtplib.SMTP('localhost') as server:
        server.send_message(message)
def format_output(d1: str, d2: str, pct: str, N: int, G: int, url: str) -> str:
    """Assemble the text of the usage report.

    The report lists the window start (``d1``), the window end (``d2``),
    the GPU count (``G``) and ``url``, followed by a headline giving the
    usage ``pct`` over the previous ``N`` days, framed above and below by
    a rule of ``=`` characters of the same width as the headline.
    """
    headline = f"GPU usage = {pct} (previous {N} days)"
    rule = "=" * len(headline)
    pieces = (
        f"Start: {d1}\n",
        f" End: {d2}\n",
        f" GPUs: {G}\n",
        f"{url}\n\n",
        rule + "\n",
        headline + "\n",
        rule,
    )
    return "".join(pieces)
if __name__ == "__main__":
    # Script entry point: query sacct for the requested window, compute the
    # fraction of available GPU-seconds that were used, and email the result.
    parser = argparse.ArgumentParser(description='Cryoem GPU Usage')
    parser.add_argument('-M', '--clusters', type=str, default="della",
                        help='Specify cluster(s) (e.g., --clusters=della,traverse)')
    parser.add_argument('-r', '--partition', type=str, default="cryoem",
                        help='Specify partition(s) (e.g., --partition=gpu,mig)')
    parser.add_argument('-e', '--email', type=str, default=None,
                        help='Email address of the recipient')
    parser.add_argument('-s', '--subject', type=str, default="Cryoem GPU Usage",
                        help='Subject of the email')
    parser.add_argument('--days',
                        type=int,
                        default=14,
                        metavar='N',
                        help='Start date is N previous days from today (default: 14)')
    parser.add_argument('--gpus',
                        type=int,
                        default=152,
                        metavar='N',
                        choices=range(1, 1000),
                        help='Maximum number of GPUs available (default: 152)')
    parser.add_argument('--no-correction', action='store_true', default=False,
                        help='Do not apply correction to only include usage during time window and not before')
    args = parser.parse_args()
    # A window of zero or negative length would make the available
    # GPU-seconds zero or negative and the percentage meaningless.
    if args.days < 1:
        parser.error("--days must be a positive integer")

    # convert slurm timestamps to seconds
    os.environ["SLURM_TIME_FORMAT"] = "%s"

    start_date, end_date, elapsed_seconds = get_time_window(args.days)
    partitions = f"-r {args.partition}"
    fields = "alloctres,elapsedraw,start"
    df = get_data_from_sacct(args.clusters, start_date, end_date, partitions, fields)

    # Keep only rows whose elapsedraw and start fields are present and purely
    # numeric, then convert both to integers. The explicit copy lets the
    # column assignments below operate on an independent frame rather than
    # on a filtered view of the sacct output.
    df = df.dropna(subset=["elapsedraw", "start"])
    numeric = df["elapsedraw"].str.isnumeric() & df["start"].str.isnumeric()
    df = df[numeric].copy()
    df["elapsedraw"] = df["elapsedraw"].astype("int64")
    df["start"] = df["start"].astype("int64")
    # Jobs with no elapsed time contribute nothing.
    df = df[df["elapsedraw"] > 0].copy()

    # apply correction to only include the usage during the time window and
    # not before the start of the window
    # NOTE(review): only time before the window start is removed here; time
    # accrued after the window end is not clipped -- confirm this is intended.
    if not args.no_correction:
        window_start = datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S").timestamp()
        # Negative for jobs that started before the window, 0 otherwise.
        df["secs-from-start"] = (df["start"] - window_start).clip(upper=0)
        df["elapsedraw"] = df["elapsedraw"] + df["secs-from-start"]

    # add columns
    df["gpus"] = df["alloctres"].apply(gpus_per_job)
    # Vectorized product; also well-defined when the frame is empty.
    df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]

    max_gpu_seconds = elapsed_seconds * args.gpus
    used_over_available = df["gpu-seconds"].sum() / max_gpu_seconds
    percent_usage = f"{round(100 * used_over_available)}%"

    # prepare email message
    url = "tiger: /home/jdh4/bin/cryoem/cryoem_gpu_usage.py"
    msg = format_output(start_date, end_date, percent_usage, args.days, args.gpus, url)
    # Always send to the fixed recipient, and additionally to --email if given.
    send_email_html(msg, "[email protected]", subject=args.subject)
    if args.email:
        send_email_html(msg, args.email, subject=args.subject)