Browse Source

Add support for jobs using multiple GPUs.

feature/gpu-detector
Egon Rijpkema 3 years ago
parent
commit
08963ce123
  1. 66
      warn_gpu_not_used

66
warn_gpu_not_used

@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
from collections import namedtuple
import datetime
import json
import pynumparser
import re
import requests
import smtplib
@ -60,6 +61,32 @@ def line_to_job(line): @@ -60,6 +61,32 @@ def line_to_job(line):
return None
def parse_gpu_string(node_string):
    """
    Expand a slurm style node string into a list of hostnames.

    Accepts strings in the format pg-gpu[1-3] or pg-gpu[2,4,5]. A string
    that does not contain a bracketed number sequence is returned
    unchanged as the only element of the list.

    Args:
        node_string (string): The GPU node(s) in slurm format.

    Returns:
        List: The hostnames. Each one is the base name followed by a node
        number, zero padded to at least two digits (base pg-gpu and
        number 1 give pg-gpu01).
    """
    # Raw string: '\[' and '\]' are invalid escape sequences in a normal
    # string literal and raise a warning on recent Python versions.
    match = re.search(r'(.+\[)([0-9]|-|,)+?(?=\])', node_string)
    if match is None:
        return [node_string]
    # The lookahead keeps the closing bracket out of the match, so the
    # matched text has the form "<base>[<sequence>".
    base, sequence = match.group().split('[')
    parser = pynumparser.NumberSequence(int)
    return ['{}{:02d}'.format(base, i) for i in parser.parse(sequence)]
def get_gpus_usage(nodes, start, end):
    """
    Look up the GPU usage of every node named in a slurm node string.

    The node string is expanded with parse_gpu_string and the usage of
    each resulting host is queried with get_gpu_usage for the same
    start and end timestamps.

    Args:
        nodes (string): The GPU node(s) in slurm format.
        start (int): start of measurements timestamp.
        end (int): end of measurements timestamp.

    Returns:
        List: A list of tuples [(<hostname>, <percentage>)]
    """
    per_host = []
    for hostname in parse_gpu_string(nodes):
        # One query per expanded hostname, kept in expansion order.
        per_host.append((hostname, get_gpu_usage(hostname, start, end)))
    return per_host
def get_gpu_usage(node, start, end):
"""
@ -80,11 +107,15 @@ def get_gpu_usage(node, start, end): @@ -80,11 +107,15 @@ def get_gpu_usage(node, start, end):
'step':
'60s'
}
try:
data = requests.get(
'https://knyft.hpc.rug.nl:9091/api/v1/query_range', params=payload)
values = json.loads(data.content.decode())['data']['result'][0]['values']
average = sum([int(i[1]) for i in values]) / len(values)
except:
import ipdb; ipdb.set_trace()
data = requests.get(
'https://knyft.hpc.rug.nl:9091/api/v1/query_range', params=payload)
values = json.loads(data.content.decode())['data']['result'][0]['values']
average = sum([int(i[1]) for i in values]) / len(values)
return average
@ -94,8 +125,7 @@ def gpu_load(job): @@ -94,8 +125,7 @@ def gpu_load(job):
"""
end = time.time()
start = end - job.delta.total_seconds()
gpu_usage = get_gpu_usage(job.node, start, end)
return gpu_usage
return get_gpus_usage(job.node, start, end)
def post_to_slack(message: str, slack_url: str):
@ -172,17 +202,19 @@ def main(): @@ -172,17 +202,19 @@ def main():
long_jobs = filter(lambda j: j.delta.total_seconds() > 3600, jobs)
message = []
for job in long_jobs:
gpu_usage = gpu_load(job)
job_info = f'Job id: {job.id:10} User: {job.user:9} Gpu usage: {gpu_usage:5.1f}'
print(job_info)
if gpu_usage == 0.0:
message.append(job_info)
subprocess.check_output(['scancel', str(job.id)])
send_mail(
sender='root@peregrine.hpc.rug.nl',
receiver=find_email(job.user),
message=EMAIL_MESSAGE + job_info)
gpus_usage = gpu_load(job)
for entry in gpus_usage:
gpu, usage = entry
job_info = f'Job id: {job.id:10} User: {job.user:9} Gpu usage: {usage:5.1f} ({gpu})'
print(job_info)
if usage == 0.0:
message.append(job_info)
subprocess.check_output(['scancel', str(job.id)])
send_mail(
sender='root@peregrine.hpc.rug.nl',
receiver=find_email(job.user),
message=EMAIL_MESSAGE + job_info)
slack_url = 'https://hooks.slack.com/services/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'
if message != []:

Loading…
Cancel
Save