|
|
|
@ -3,6 +3,7 @@
@@ -3,6 +3,7 @@
|
|
|
|
|
from collections import namedtuple |
|
|
|
|
import datetime |
|
|
|
|
import json |
|
|
|
|
import pynumparser |
|
|
|
|
import re |
|
|
|
|
import requests |
|
|
|
|
import smtplib |
|
|
|
@ -60,6 +61,32 @@ def line_to_job(line):
@@ -60,6 +61,32 @@ def line_to_job(line):
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_gpu_string(node_string):
    """
    Expand a slurm-style node string into individual host names.

    Accepts strings such as ``pg-gpu[1-3]`` or ``pg-gpu[2,4,5]``. The text
    between the brackets is handed to ``pynumparser`` and every number it
    yields is appended to the prefix, zero-padded to two digits.

    Args:
        node_string (string): The GPU node(s) in slurm format.
    Returns: List: host names. A string without a bracketed number
        sequence is returned unchanged as a single-element list.
    """
    # Raw string: '\[' inside a plain literal is an invalid escape sequence.
    # The prefix and the sequence are captured separately instead of
    # splitting the match on '[', which could not be unpacked into two
    # values when the prefix itself contained a '['.
    match = re.search(r'(.+)\[([0-9,-]+)\]', node_string)
    if match is None:
        return [node_string]

    base, sequence = match.groups()
    parser = pynumparser.NumberSequence(int)
    return ['{}{:02d}'.format(base, i) for i in parser.parse(sequence)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_gpus_usage(nodes, start, end):
    """
    Collect the average GPU usage of every node named by a slurm node string.

    The node string is expanded with parse_gpu_string and get_gpu_usage is
    called once per resulting host name.

    Args:
        nodes (string): The GPU node(s) in slurm format.
        start (int): start of measurements timestamp.
        end (int): end of measurements timestamp.
    Returns: List: A list of tuples [(<hostname>, <percentage>)]
    """
    per_host = []
    for hostname in parse_gpu_string(nodes):
        per_host.append((hostname, get_gpu_usage(hostname, start, end)))
    return per_host
|
|
|
|
|
|
|
|
|
def get_gpu_usage(node, start, end): |
|
|
|
|
""" |
|
|
|
@ -80,11 +107,15 @@ def get_gpu_usage(node, start, end):
@@ -80,11 +107,15 @@ def get_gpu_usage(node, start, end):
|
|
|
|
|
'step': |
|
|
|
|
'60s' |
|
|
|
|
} |
|
|
|
|
try: |
|
|
|
|
data = requests.get( |
|
|
|
|
'https://knyft.hpc.rug.nl:9091/api/v1/query_range', params=payload) |
|
|
|
|
|
|
|
|
|
values = json.loads(data.content.decode())['data']['result'][0]['values'] |
|
|
|
|
average = sum([int(i[1]) for i in values]) / len(values) |
|
|
|
|
except: |
|
|
|
|
import ipdb; ipdb.set_trace() |
|
|
|
|
|
|
|
|
|
data = requests.get( |
|
|
|
|
'https://knyft.hpc.rug.nl:9091/api/v1/query_range', params=payload) |
|
|
|
|
values = json.loads(data.content.decode())['data']['result'][0]['values'] |
|
|
|
|
average = sum([int(i[1]) for i in values]) / len(values) |
|
|
|
|
return average |
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -94,8 +125,7 @@ def gpu_load(job):
@@ -94,8 +125,7 @@ def gpu_load(job):
|
|
|
|
|
""" |
|
|
|
|
end = time.time() |
|
|
|
|
start = end - job.delta.total_seconds() |
|
|
|
|
gpu_usage = get_gpu_usage(job.node, start, end) |
|
|
|
|
return gpu_usage |
|
|
|
|
return get_gpus_usage(job.node, start, end) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def post_to_slack(message: str, slack_url: str): |
|
|
|
@ -172,17 +202,19 @@ def main():
@@ -172,17 +202,19 @@ def main():
|
|
|
|
|
long_jobs = filter(lambda j: j.delta.total_seconds() > 3600, jobs) |
|
|
|
|
message = [] |
|
|
|
|
for job in long_jobs: |
|
|
|
|
gpu_usage = gpu_load(job) |
|
|
|
|
job_info = f'Job id: {job.id:10} User: {job.user:9} Gpu usage: {gpu_usage:5.1f}' |
|
|
|
|
print(job_info) |
|
|
|
|
|
|
|
|
|
if gpu_usage == 0.0: |
|
|
|
|
message.append(job_info) |
|
|
|
|
subprocess.check_output(['scancel', str(job.id)]) |
|
|
|
|
send_mail( |
|
|
|
|
sender='root@peregrine.hpc.rug.nl', |
|
|
|
|
receiver=find_email(job.user), |
|
|
|
|
message=EMAIL_MESSAGE + job_info) |
|
|
|
|
gpus_usage = gpu_load(job) |
|
|
|
|
for entry in gpus_usage: |
|
|
|
|
gpu, usage = entry |
|
|
|
|
job_info = f'Job id: {job.id:10} User: {job.user:9} Gpu usage: {usage:5.1f} ({gpu})' |
|
|
|
|
print(job_info) |
|
|
|
|
|
|
|
|
|
if usage == 0.0: |
|
|
|
|
message.append(job_info) |
|
|
|
|
subprocess.check_output(['scancel', str(job.id)]) |
|
|
|
|
send_mail( |
|
|
|
|
sender='root@peregrine.hpc.rug.nl', |
|
|
|
|
receiver=find_email(job.user), |
|
|
|
|
message=EMAIL_MESSAGE + job_info) |
|
|
|
|
|
|
|
|
|
slack_url = 'https://hooks.slack.com/services/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' |
|
|
|
|
if message != []: |
|
|
|
|