Would like to move to https://github.com/rug-cit-hpc/pg-playbooks but has large files...
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

113 lines
3.1 KiB

#!/bin/env python3
from collections import namedtuple
import datetime
import json
import logging
import subprocess
import time

import requests
# Record describing one running job as reported by squeue.
Job = namedtuple('Job', ['id', 'delta', 'node', 'user'])


def line_to_job(line):
    """
    Convert one line of squeue output into a Job record.

    The line is expected to look like
    ``<job-id>,<elapsed>,<nodelist>,<user>`` where ``<elapsed>`` has the form
    ``[days-][[hours:]minutes:]seconds``.

    Args:
        line (string): line from squeue

    Returns:
        Job: named tuple ``(id, delta, node, user)``. ``delta`` is a
        ``datetime.timedelta``; the other fields are kept as strings.

    Raises:
        ValueError: if the line has fewer than four comma-separated fields
            or the elapsed time contains a non-integer component.
    """
    # A compressed node list such as "gpu[01,03]" contains commas itself, so
    # peel the fixed fields off both ends instead of splitting on every comma.
    job_id, timestring, rest = line.split(',', 2)
    node, user = rest.rsplit(',', 1)

    if '-' in timestring:
        days, remainder = timestring.split('-')
        days = int(days)
    else:
        days = 0
        remainder = timestring

    # Left-pad with zeros so "ss", "mm:ss" and "hh:mm:ss" all unpack into
    # exactly three values.
    values = [0, 0, 0] + [int(i) for i in remainder.split(':')]
    hours, minutes, seconds = values[-3:]
    delta = datetime.timedelta(
        days=days, hours=hours, minutes=minutes, seconds=seconds)
    return Job(job_id, delta, node, user)
def get_gpu_usage(node, start, end):
    """
    Calculate the average GPU usage between begin and end stamps.

    Args:
        node (string): The GPU node
        start (int): start of measurements timestamp.
        end (int): end of measurements timestamp.

    Returns:
        The arithmetic mean of the sample values of the first result series
        returned by the ``query_range`` request.
    """
    # NOTE(review): the contents of this dict literal (the query parameters
    # and its closing brace) are missing from this copy of the file and must
    # be restored from the original source before the function can run.
    payload = {
    # Presumably a Prometheus-compatible range-query endpoint -- confirm.
    data = requests.get(
        'https://knyft.hpc.rug.nl:9091/api/v1/query_range', params=payload)
    # Only the first series in the response is used.
    # NOTE(review): an empty 'result' list raises IndexError here -- confirm
    # the query always matches a series for every node.
    values = json.loads(data.content.decode())['data']['result'][0]['values']
    # Each sample is indexed as [<something>, <value>]; element 1 is parsed
    # with int().
    # NOTE(review): int() raises ValueError on non-integer strings such as
    # "12.5" -- confirm the metric is always integer-valued.
    average = sum([int(i[1]) for i in values]) / len(values)
    return average
def gpu_load(job):
    """
    Calculate the GPU load of a RUNNING job.

    The measurement window ends now and reaches back by the job's elapsed
    run time (``job.delta``).

    Args:
        job: record with a ``node`` attribute and a ``delta`` timedelta.

    Returns:
        Whatever ``get_gpu_usage`` reports for ``job.node`` over that window.
    """
    now = time.time()
    began = now - job.delta.total_seconds()
    return get_gpu_usage(job.node, began, now)
def post_to_slack(message: str, slack_url: str):
    """
    Post a message to slack.

    The message is sent to the ``#peregrine-alerts`` channel under the
    ``kill-hogs`` user name, and the HTTP status code and response body are
    logged at INFO level.

    Args:
        message (str): Message to post
        slack_url (str): url to post message to

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    data = json.dumps({
        'channel': '#peregrine-alerts',
        'username': 'kill-hogs',
        'text': message,
        'icon_emoji': ':scales:',
    })
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would leave this script hanging.
    response = requests.post(
        slack_url,
        data=data,
        headers={'Content-Type': 'application/json'},
        timeout=30,
    )
    logging.info('Posting to slack')
    # Lazy %-formatting; the emitted text matches the former concatenation.
    logging.info('%s%s', response.status_code, response.text)
def main():
    """
    Report long-running GPU jobs that show no GPU usage to Slack.

    Lists the running jobs in the ``gpu`` partition with ``squeue``, keeps
    those that have been running for more than one hour, looks up the
    average GPU usage of each, and posts a single Slack message with one
    line per job whose usage is exactly zero. Nothing is posted when no
    such job exists.
    """
    output = subprocess.check_output([
        'squeue', '--partition', 'gpu', '--state=R', '-h', '-o', '%i,%M,%N,%u'
    ])
    jobs = [
        line_to_job(line) for line in output.decode().split('\n') if line != ''
    ]
    # Only jobs that have been running for more than an hour are checked.
    long_jobs = [job for job in jobs if job.delta.total_seconds() > 3600]

    message = []
    for job in long_jobs:
        gpu_usage = gpu_load(job)
        job_info = 'Job id: {job.id:10} User: {job.user:9} Gpu usage: {gpu_usage:5.1f}'.format(
            job=job, gpu_usage=gpu_usage)
        if gpu_usage == 0.0:
            message.append(job_info)

    # NOTE(review): this webhook URL is a credential committed to source
    # control; consider loading it from configuration or the environment.
    slack_url = 'https://hooks.slack.com/services/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'
    if message:
        post_to_slack(message='\n'.join(message), slack_url=slack_url)
if __name__ == "__main__":