Browse Source

Merge branch 'feature/kill_hoggs_unittests'

pull/11/head
Egon Rijpkema 3 years ago
parent
commit
cd5527cd55
  1. 13
      roles/kill_memory_hogs/files/Pipfile
  2. 188
      roles/kill_memory_hogs/files/kill_hoggs.py
  3. 10
      roles/kill_memory_hogs/files/kill_hoggs.yml
  4. 16
      roles/kill_memory_hogs/files/kill_hogs.yml
  5. 49
      roles/kill_memory_hogs/files/test_kill_hoggs.py
  6. 24
      roles/kill_memory_hogs/tasks/main.yml

13
roles/kill_memory_hogs/files/Pipfile

@ -1,13 +0,0 @@ @@ -1,13 +0,0 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[packages]
pyaml = "*"
requests = "*"
[dev-packages]
[requires]
python_version = "3.7"

188
roles/kill_memory_hogs/files/kill_hoggs.py

@ -1,188 +0,0 @@ @@ -1,188 +0,0 @@
#!/usr/bin/env python3
from collections import defaultdict
import argparse
import json
import logging
import psutil
import requests
import subprocess
import time
import yaml
def post_to_slack(message: str):
    """
    Post a message to slack.

    The webhook URL is read from the ``slack_url`` key of
    /opt/kill_hoggs/kill_hoggs.yml. Network-level failures while posting are
    logged and swallowed, so an unreachable Slack cannot abort the caller.
    Problems with the settings file itself (missing file, missing key) still
    raise.

    Args:
        message (str): Message to post
    """
    with open('/opt/kill_hoggs/kill_hoggs.yml', 'r') as f:
        # BaseLoader only constructs plain strings, lists and dicts.
        config = yaml.load(f.read(), Loader=yaml.BaseLoader)
    slack_url = config['slack_url']
    data = json.dumps({
        'channel': '#peregrine-alerts',
        'username': 'kill-hoggs',
        'text': message,
        'icon_emoji': ':scales:'
    }).encode('utf-8')
    logging.info('Posting to slack')
    try:
        # NOTE(review): no timeout is passed, so a stalled connection can
        # still block this call; adding one changes the call signature that
        # existing tests may pin.
        response = requests.post(
            slack_url, data=data, headers={'Content-Type': 'application/json'})
    except requests.RequestException as err:
        # Notification is secondary: report the failure and carry on.
        logging.error('Posting to slack failed: %s', err)
        return
    logging.info(str(response.status_code) + str(response.text))
def send_message_to_terminals(user: str, message: str):
    """
    Sends <message> to all terminals on which <user> is logged in.

    The message is handed to ``write`` on standard input and the command is
    run without a shell, so quotes, ``$`` or backticks in the message, user
    or terminal name are passed through literally instead of being
    interpreted.

    Args:
        user (str): Login name of the recipient.
        message (str): Text to display on each of the user's terminals.
    """
    # Trailing newline mirrors what `echo` appended in the shell pipeline.
    payload = (message + '\n').encode()
    for terminal in find_terminals_of_user(user):
        try:
            # Exit status is deliberately not checked: delivery is best effort
            # (the user may have messages disabled or have just logged out).
            subprocess.run(['write', user, terminal], input=payload)
        except OSError as err:
            # `write` missing or not executable: keep going, as the shell
            # version did.
            logging.warning('Could not write to %s on %s: %s',
                            user, terminal, err)
def find_terminals_of_user(user: str):
    """
    List the terminals on which a user is currently logged in.

    Runs ``w -s -h`` and reads the first two whitespace-separated columns of
    every line as user name and terminal.

    Args:
        user (str): The user whose terminals to return.
    Returns:
        list: A list of terminals (string). Empty when the user has no
        session or when ``w`` cannot be run.
    """
    try:
        result = subprocess.run(
            ['w', '-s', '-h'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as err:
        logging.warning('Could not run w: %s', err)
        return []
    terminals = []
    # Decode the bytes properly instead of parsing their repr().
    for line in result.stdout.decode(errors='replace').splitlines():
        fields = line.split()
        # Compare the user column exactly; a substring test on the whole line
        # would also match other accounts (p123 vs p1234) and command text.
        # NOTE(review): some `w` builds truncate long user names in this
        # column; such sessions will not match here - confirm on the target
        # hosts.
        if len(fields) >= 2 and fields[0] == user:
            terminals.append(fields[1])
    return terminals
def on_terminate(proc):
    """
    Log the exit code of a finished process.

    Handed to psutil.wait_procs() as its callback by terminate().

    Args:
        proc: Process object exposing a ``returncode`` attribute.
    """
    report = 'process {} terminated with exit code {}'.format(
        proc, proc.returncode)
    logging.info(report)
def terminate(kill_list):
    """
    Terminate processes. Kill if terminate is unsuccessful.

    Every process is first asked to terminate; those still alive after a
    three second grace period are killed. Processes that have already exited
    are skipped instead of aborting the whole sweep.

    Args:
        kill_list (list): List of processes to kill.
    """
    if not kill_list:
        return  # nothing to do
    for proc in kill_list:
        try:
            proc.terminate()
        except psutil.NoSuchProcess:
            # Exited between sampling and now; the others must still be handled.
            continue
    _gone, alive = psutil.wait_procs(
        kill_list, timeout=3, callback=on_terminate)
    for proc in alive:
        logging.info('Killing {} with signal 9'.format(proc))
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # Died on its own right after the grace period.
            pass
def _is_killable_account(username: str) -> bool:
    """Return True for s/p/f/g accounts followed by digits, or umcg- accounts."""
    if username.startswith('umcg-'):
        return True
    # Slicing keeps an empty user name from raising IndexError.
    return username[:1] in ('s', 'p', 'f', 'g') and username[1:].isdigit()


def kill_hogs(memory_threshold,
              cpu_threshold,
              dummy: bool = False,
              slack: bool = False,
              interval: float = .3,
              warning: str = ''):
    """
    Kill all processes of a user whose combined usage exceeds a threshold.

    Usage is summed per user over all of that user's counted processes. For
    efficiency reasons a process is only counted when it uses at least .1 %
    of memory or at least 1 % of cpu. Processes whose real uid is 0 and
    processes of accounts other than s/p/f/g<digits> and umcg-* are never
    counted or killed.

    Args:
        memory_threshold (float): Summed memory percentage above which a
            user's processes are killed.
        cpu_threshold (float): Summed cpu percentage above which a user's
            processes are killed.
        dummy (bool): If true, do not actually kill processes. Messages are
            still logged and sent.
        slack (bool): send messages to slack.
        interval (float): Seconds to wait between the two cpu measurements.
        warning (str): Text written to the user's terminals. A default
            message is used when empty.
    """
    if warning == '':
        warning = (
            'Please submit your processes as a job.\n'
            'Your processes have been killed and this incident has been reported.\n'
            'For more information, see https://redmine.hpc.rug.nl/redmine/projects/peregrine/wiki/FAQ')

    users = defaultdict(
        lambda: {'cpu_percent': 0, 'memory_percent': 0, 'processes': []})
    procs = list(psutil.process_iter())

    # First call of cpu_percent() without blocking interval is meaningless.
    # see https://psutil.readthedocs.io/en/latest/
    # So prime every process, wait, and measure for real afterwards.
    for proc in procs:
        try:
            proc.cpu_percent()
        except (psutil.NoSuchProcess, FileNotFoundError):
            pass
    time.sleep(interval)

    for proc in procs:
        try:
            proc.cached_cpu_percent = proc.cpu_percent()
            proc.cached_memory_percent = proc.memory_percent()
            if proc.uids().real == 0:
                continue  # do not kill root processes.
            if proc.cached_memory_percent < .1 and proc.cached_cpu_percent < 1:
                continue  # negligible usage, not worth counting.
            # Check username here. It is somewhat expensive.
            username = proc.username()
            if not _is_killable_account(username):
                continue  # we only kill processes of p, s, f, g and umcg- accounts
            users[username]['memory_percent'] += proc.cached_memory_percent
            users[username]['cpu_percent'] += proc.cached_cpu_percent
            users[username]['processes'].append(proc)
        except (psutil.NoSuchProcess, FileNotFoundError):
            # Process vanished while being inspected.
            pass

    for username, data in users.items():
        if not (data['memory_percent'] > memory_threshold
                or data['cpu_percent'] > cpu_threshold):
            continue
        message = [
            'User {} uses \n {:.2f} % of cpu. '.format(
                username, data['cpu_percent']),
            '{:.2f} % of memory. '.format(data['memory_percent']),
            'The following processes will be killed:'
        ]
        for proc in data['processes']:
            try:
                name = proc.name()
            except (psutil.NoSuchProcess, FileNotFoundError):
                # Exited since sampling; still list it with the cached numbers.
                name = '<exited>'
            # The processes are grouped by username, so reuse it rather than
            # querying each (possibly gone) process again.
            message.append(
                '{} pid {} {} memory {:.2f}% cpu {:.2f}%'.format(
                    username, proc.pid, name,
                    proc.cached_memory_percent, proc.cached_cpu_percent))
        logging.info('\n'.join(message))
        # Address the user being handled, not whatever process the loop
        # above happened to end on.
        send_message_to_terminals(username, warning)
        if slack:
            post_to_slack('\n'.join(message))
        if not dummy:
            terminate(data['processes'])
if __name__ == '__main__':
    # Script entry point: configure logging, read the command line and run
    # a single sweep with the requested thresholds and flags.
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()

    # Float-valued thresholds: (option, default, help text).
    for option, default, description in (
            ('--memory_threshold', 10,
             'memory percentage above which processes are killed'),
            ('--cpu_threshold', 600,
             'cpu percentage above which processes are killed'),
    ):
        parser.add_argument(
            option, type=float, default=default, help=description)

    # Boolean switches: (option, help text).
    for option, description in (
            ('--dummy', 'Only display what would be killed'),
            ('--slack', 'Post messages to slack'),
    ):
        parser.add_argument(option, action='store_true', help=description)

    args = parser.parse_args()
    kill_hogs(
        memory_threshold=args.memory_threshold,
        cpu_threshold=args.cpu_threshold,
        dummy=args.dummy,
        slack=args.slack)

10
roles/kill_memory_hogs/files/kill_hoggs.yml

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
$ANSIBLE_VAULT;1.1;AES256
39663261313566383536653833343361656637623431663261386165646131326538396434356566
3030646561336333666362323238393631343236313434350a646434363161383330396634323066
61353861613330306666366263393131333639393734313130303531343264303165343365336237
3632646162333639370a636239353066316661326230616233643332666661656637383661363466
33666330343963396135666530383036306131376565343438363761633838633233616334383164
62383930613262613566643438326530343262623433353130613866373336326363346236383564
63616635623239613732313666653936663033646532623864313336656662626561613938623838
36366234343264396238343265343363363062343063313738396138393335383437653462626366
6534

16
roles/kill_memory_hogs/files/kill_hogs.yml

@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
$ANSIBLE_VAULT;1.1;AES256
38353638343531646539343066616364303531313933653965353563343962353033303563666361
3838326366316133396162626334363534356562333539340a633863343530353961646630613030
61666434346130306666646565386239353936326462383763623934303861373461313838346364
6238653639336539340a626138633033356434636533323539366166633066623736623334623232
63323935376238303130383130326264653166666666393238323630333963316261643939333961
34636331383732326638373331633730316238666565373438656566353836316637306362653136
61643561373266373963363435386333353837326464386331336136323135376339343734323536
32333132383632653231326562613533333764323133626435646235656139323665613834323838
36393163356233323739306436396435326437666233353034373334366337313564366431393363
65633166643430366562646534396135363032303632326365333834643039306539636639613138
64656134376264633638313031306566363933326162326136303534656262303062373139613566
63646636636539653462306361633830633730643563613865643333343962333363306239383564
62393363343139633535306266616435363064626434306639616565656633626564613536613963
30636530336363623635656162653836616435643138363962353266353263383530336535636136
373539323962633634383062393064303930

49
roles/kill_memory_hogs/files/test_kill_hoggs.py

@ -1,49 +0,0 @@ @@ -1,49 +0,0 @@
import unittest
from unittest import mock
import kill_hoggs
def mocked_requests_post(*args, **kwargs):
    """
    Stand-in for requests.post used as a mock side effect.

    Answers 200 with a small JSON payload when the first positional argument
    is the expected webhook URL, and 404 with no payload otherwise.

    Adapted from an answer here:
    https://stackoverflow.com/questions/15753390/how-can-i-mock-requests-and-the-response
    """

    class MockResponse:
        """Just enough of a response object: status_code, text and json()."""

        def __init__(self, json_data, status_code):
            self.json_data = json_data
            self.status_code = status_code
            self.text = "MOcked successfully."

        def json(self):
            return self.json_data

    expected_url = 'https://hooks.slack.com/services/normally/random/string'
    url = args[0]
    if url == expected_url:
        payload, status = {"key1": "value1"}, 200
    else:
        payload, status = None, 404
    return MockResponse(payload, status)
# YAML text standing in for the settings file; it is what the patched
# open() returns when post_to_slack reads its configuration.
dummy_config = '''
---
slack_url: 'https://hooks.slack.com/services/normally/random/string'
'''
class PostToSlackTestcase(unittest.TestCase):
    """Unit test for kill_hoggs.post_to_slack."""

    # We patch 'requests.post' with our own method. The mock object is passed
    # in to our test case method. open() is patched with an explicit
    # replacement object, so it adds no extra argument.
    @mock.patch('requests.post', side_effect=mocked_requests_post)
    @mock.patch('builtins.open', mock.mock_open(read_data=dummy_config))
    def test_post_to_slack(self, mock_post):
        """
        Call post_to_slack and make sure requests.post was called exactly
        once, with the right parameters.
        """
        kill_hoggs.post_to_slack('Hello world')
        # A single strict assertion replaces the earlier check that the
        # first recorded call is contained in the list of recorded calls,
        # which could never fail.
        mock_post.assert_called_once_with(
            'https://hooks.slack.com/services/normally/random/string',
            data=b'{"channel": "#peregrine-alerts", "username": "kill-hoggs", "text": "Hello world", "icon_emoji": ":scales:"}',
            headers={'Content-Type': 'application/json'})
if __name__ == "__main__":
    # Run the test cases when this file is executed directly.
    unittest.main()

24
roles/kill_memory_hogs/tasks/main.yml

@ -8,28 +8,22 @@ @@ -8,28 +8,22 @@
- python36-requests
- python36-psutil
- name: make directory for script and settings.
file:
path: /opt/kill_hoggs
mode: 0777
state: directory
- git:
repo: 'git@github.com:rug-cit-hpc/kill-hogs.git'
dest: /opt/kill_hogs
version: master
update: yes
- name: Install python script
copy:
src: 'files/kill_hoggs.py'
dest: /opt/kill_hoggs/kill_hoggs.py
mode: 0700
- name: Install settings file
copy:
src: 'files/kill_hoggs.yml'
dest: /opt/kill_hoggs/kill_hoggs.yml
src: 'files/kill_hogs.yml'
dest: /opt/kill_hogs/kill_hogs.yml
mode: 0600
- cron:
name: Kill jobs that are using excessive memory.
minute: '*/10'
minute: '*/2'
user: root
job: '/usr/bin/python36 /opt/kill_hoggs/kill_hoggs.py --slack'
job: '/usr/bin/python36 /opt/kill_hogs/kill_hogs.py --slack'
cron_file: kill_memory_hogs

Loading…
Cancel
Save