|
|
|
@ -6,6 +6,7 @@ import json
@@ -6,6 +6,7 @@ import json
|
|
|
|
|
import logging |
|
|
|
|
import psutil |
|
|
|
|
import requests |
|
|
|
|
import subprocess |
|
|
|
|
import time |
|
|
|
|
import yaml |
|
|
|
|
|
|
|
|
@ -32,6 +33,33 @@ def post_to_slack(message: str):
@@ -32,6 +33,33 @@ def post_to_slack(message: str):
|
|
|
|
|
logging.info(str(response.status_code) + str(response.text)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def send_message_to_terminals(user: str, message: str):
    """
    Sends <message> to all terminals on which <user> is logged in.

    The text is handed to ``write`` on its standard input instead of being
    spliced into a shell command line, so quotes, ``$`` or backticks inside
    <message> (or <user>) are delivered literally rather than being
    interpreted by a shell.

    Args:
        user (str): The user to notify.
        message (str): The text to show on each of the user's terminals.
    """
    terminals = find_terminals_of_user(user)
    # `echo` used to terminate the text with a newline; keep doing so.
    payload = message + '\n'
    for terminal in terminals:
        try:
            subprocess.run(['write', user, terminal], input=payload, text=True)
        except OSError as e:
            # Notifying the user is best effort: a missing or unusable
            # `write` binary must not abort the caller.
            logging.warning('Could not write to %s on %s: %s',
                            user, terminal, e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_terminals_of_user(user: str):
    """
    Lists the terminals on which <user> is currently logged in.

    Runs ``w -s -h <user>`` (short format, no header) and takes the second
    column of every output line as the terminal name.

    Args:
        user (str): The user whose terminals to return.

    Returns:
        list: A list of terminals (string). Empty if the user has no
            sessions or if ``w`` could not be executed.
    """
    try:
        # Passing the user name lets `w` restrict its listing to that user;
        # an argument list avoids any shell interpretation of the name.
        result = subprocess.run(['w', '-s', '-h', user], capture_output=True)
    except OSError as e:
        logging.warning('Could not run w for user %s: %s', user, e)
        return []

    terminals = []
    for line in result.stdout.decode(errors='replace').splitlines():
        fields = line.split()
        # Skip blank or malformed lines that lack a terminal column.
        if len(fields) > 1:
            terminals.append(fields[1])
    return terminals
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def on_terminate(proc): |
|
|
|
|
""" |
|
|
|
|
Callback for terminate() |
|
|
|
@ -60,7 +88,8 @@ def kill_hogs(memory_threshold,
@@ -60,7 +88,8 @@ def kill_hogs(memory_threshold,
|
|
|
|
|
cpu_threshold, |
|
|
|
|
dummy: bool = False, |
|
|
|
|
slack: bool = False, |
|
|
|
|
interval: float = .3): |
|
|
|
|
interval: float = .3, |
|
|
|
|
warning: str = ''): |
|
|
|
|
""" |
|
|
|
|
Kill all processes of a user using more than <threshold> % of memory. And cpu. |
|
|
|
|
For efficiency reasons only processes using more than .1 % of the available |
|
|
|
@ -105,23 +134,27 @@ def kill_hogs(memory_threshold,
@@ -105,23 +134,27 @@ def kill_hogs(memory_threshold,
|
|
|
|
|
users[username]['processes'].append(proc) |
|
|
|
|
except (psutil.NoSuchProcess, FileNotFoundError) as e: |
|
|
|
|
pass |
|
|
|
|
#logging.exception(e) |
|
|
|
|
|
|
|
|
|
for username, data in users.items(): |
|
|
|
|
if data['memory_percent'] > memory_threshold or data['cpu_percent'] > cpu_threshold: |
|
|
|
|
message = [ |
|
|
|
|
'User {} uses {:.2f} % of cpu. '.format( |
|
|
|
|
username, data['cpu_percent']), |
|
|
|
|
'User {} uses {:.2f} % of memory. '.format( |
|
|
|
|
username, data['memory_percent']), |
|
|
|
|
'User {} uses \n {:.2f} % of cpu. '.format( |
|
|
|
|
username, |
|
|
|
|
data['cpu_percent']), '{:.2f} % of memory. '.format( |
|
|
|
|
username, data['memory_percent']), |
|
|
|
|
'The following processes will be killed:' |
|
|
|
|
] |
|
|
|
|
for proc in data['processes']: |
|
|
|
|
message.append('{} pid {} {} memory {:.2f}% cpu {:.2f}%'.format( |
|
|
|
|
proc.username(), proc.pid, proc.name(), |
|
|
|
|
proc.cached_memory_percent, |
|
|
|
|
proc.cached_cpu_percent)) |
|
|
|
|
message.append( |
|
|
|
|
'{} pid {} {} memory {:.2f}% cpu {:.2f}%'.format( |
|
|
|
|
proc.username(), proc.pid, proc.name(), |
|
|
|
|
proc.cached_memory_percent, proc.cached_cpu_percent)) |
|
|
|
|
logging.info('\n'.join(message)) |
|
|
|
|
if warning == '': |
|
|
|
|
warning = """Please submit your processes as a job. |
|
|
|
|
Your processes have been killed and this incident has been reported. |
|
|
|
|
For more information, see https://redmine.hpc.rug.nl/redmine/projects/peregrine/wiki/FAQ""" |
|
|
|
|
send_message_to_terminals(proc.username(), warning) |
|
|
|
|
if slack: |
|
|
|
|
post_to_slack('\n'.join(message)) |
|
|
|
|
if not dummy: |
|
|
|
|