Browse Source

Added more appropriate message

to be displayed after sudo is typed.
Found out, after recompiling sudo, that no recompilation is needed
;-)
pull/14/head
Egon Rijpkema 2 years ago
parent
commit
fb2d12f334
  1. 19
      sudo_lecture.yml
  2. 113
      warn_gpu_not_used

19
sudo_lecture.yml

@ -0,0 +1,19 @@
---
# Install a custom sudo "lecture" text on the login hosts and point sudo at it.
- hosts: login
  become: true
  tasks:
    # Write the lecture text; the next task makes sudo use this file.
    - name: create lecture file
      copy:
        dest: /etc/sudoers.lecture
        owner: root
        group: root
        mode: '0644'
        content: |
          Sudo is only possible for administrators on this system
          If you typed sudo because it is in some tutorial,
          Please read the wiki about our cluster first
          https://wiki.hpc.rug.nl/peregrine/getting_started/module_environment

    - name: Set sudo to use custom lecture file.
      lineinfile:
        path: /etc/sudoers
        line: Defaults lecture_file = /etc/sudoers.lecture
        state: present
        insertafter: Defaults
        # Check the candidate file with visudo before it replaces
        # /etc/sudoers; a syntax error there would break sudo on the host.
        validate: /usr/sbin/visudo -cf %s

113
warn_gpu_not_used

@ -0,0 +1,113 @@
#!/bin/env python3
import datetime
import json
import logging
import subprocess
import time
from collections import namedtuple

import requests
# One running job as reported by squeue: id, elapsed time, node list, user.
Job = namedtuple('Job', ['id', 'delta', 'node', 'user'])


def line_to_job(line):
    """
    Convert one line of squeue output into a Job.

    The line must have the form ``<job-id>,<elapsed>,<nodes>,<user>``
    (squeue format ``%i,%M,%N,%u``), where ``<elapsed>`` is
    ``[days-][[hours:]minutes:]seconds``.

    Args:
        line (string): line from squeue

    Returns:
        Job: named tuple with ``id``, ``node`` and ``user`` as strings and
        ``delta`` as a ``datetime.timedelta`` holding the elapsed time.

    Raises:
        ValueError: if the line has fewer than four comma-separated fields
            or the elapsed time is not numeric.
    """
    job_id, elapsed, rest = line.split(',', 2)
    # The node list may itself contain commas (e.g. "node[01,03]"), so take
    # the user from the right-hand end instead of splitting on every comma.
    node, user = rest.rsplit(',', 1)

    days, _, clock = elapsed.rpartition('-')
    # Left-pad with zeros so "mm:ss" and "ss" parse the same as "h:mm:ss".
    fields = [0, 0, 0] + [int(part) for part in clock.split(':')]
    hours, minutes, seconds = fields[-3:]
    delta = datetime.timedelta(
        days=int(days or 0), hours=hours, minutes=minutes, seconds=seconds)
    return Job(job_id, delta, node, user)
def get_gpu_usage(node, start, end):
    """
    Calculate the average GPU usage between the start and end stamps.

    Sends a ``query_range`` request for the ``utilization_gpu`` metric of
    ``<node>:9101`` with a 60 second step and averages the returned samples.

    Args:
        node (string): The GPU node
        start (int or float): start of measurements timestamp.
        end (int or float): end of measurements timestamp.

    Returns:
        float: mean of the sampled values, or ``nan`` when the server
        returned no series or no samples for the node. ``nan`` never
        compares equal to ``0.0``, so missing data is not mistaken for an
        idle GPU.

    Raises:
        requests.RequestException: on connection problems, timeout or a
            non-2xx HTTP status.
    """
    payload = {
        'query':
        'utilization_gpu{{env="peregrine",instance="{}:9101",job="gpu"}}'.
        format(node),
        'start': start,
        'end': end,
        'step': '60s',
    }
    response = requests.get(
        'https://knyft.hpc.rug.nl:9091/api/v1/query_range',
        params=payload,
        # Never let a hanging metrics server block the whole run.
        timeout=30)
    response.raise_for_status()
    result = json.loads(response.content.decode())['data']['result']
    # NOTE(review): only the first returned series is averaged; if a node
    # exposes one series per GPU the others are ignored -- confirm.
    values = result[0]['values'] if result else []
    if not values:
        return float('nan')
    # Each sample is a [timestamp, value] pair; the value is parsed as a
    # float so that non-integer readings such as "12.5" do not raise.
    return sum(float(sample[1]) for sample in values) / len(values)
def gpu_load(job):
    """
    Return the average GPU usage of a RUNNING job over its whole runtime.

    The measurement window ends now and reaches back ``job.delta`` (the
    job's elapsed time); the average is taken on ``job.node``.
    """
    now = time.time()
    elapsed_seconds = job.delta.total_seconds()
    return get_gpu_usage(job.node, now - elapsed_seconds, now)
def post_to_slack(message: str, slack_url: str):
    """
    Post a message to slack.

    The message is sent as a JSON payload to the ``#peregrine-alerts``
    channel under the user name ``kill-hogs``; the HTTP status code and
    response body are logged at INFO level.

    Args:
        message (str): Message to post
        slack_url (str): url to post message to

    Raises:
        requests.RequestException: on connection problems or timeout.
    """
    data = json.dumps({
        'channel': '#peregrine-alerts',
        'username': 'kill-hogs',
        'text': message,
        'icon_emoji': ':scales:'
    }).encode('utf-8')
    response = requests.post(
        slack_url,
        data=data,
        headers={'Content-Type': 'application/json'},
        # Do not hang forever if the webhook endpoint is unreachable.
        timeout=10)
    logging.info('Posting to slack')
    logging.info('%s%s', response.status_code, response.text)
def main():
    """
    Report running GPU jobs that have not used the GPU at all.

    Lists the running jobs of the ``gpu`` partition with squeue, looks at
    those that have been running for more than an hour, prints one summary
    line per job and posts the jobs whose average GPU usage is exactly 0 to
    slack in a single message.
    """
    output = subprocess.check_output([
        'squeue', '--partition', 'gpu', '--state=R', '-h', '-o', '%i,%M,%N,%u'
    ])
    jobs = [line_to_job(line) for line in output.decode().split('\n') if line]
    # Only judge jobs that ran for over an hour (3600 s).
    long_jobs = [job for job in jobs if job.delta.total_seconds() > 3600]

    message = []
    for job in long_jobs:
        gpu_usage = gpu_load(job)
        # str.format fills in the placeholders; the bare literal was printed
        # verbatim with its braces.
        job_info = (
            'Job id: {job.id:10} User: {job.user:9} '
            'Gpu usage: {gpu_usage:5.1f}'
        ).format(job=job, gpu_usage=gpu_usage)
        print(job_info)
        if gpu_usage == 0.0:
            message.append(job_info)

    # NOTE(review): this webhook URL is a secret committed to the source;
    # consider moving it to configuration and rotating it.
    slack_url = 'https://hooks.slack.com/services/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'
    if message:
        post_to_slack(message='\n'.join(message), slack_url=slack_url)


if __name__ == "__main__":
    main()
Loading…
Cancel
Save