Browse Source

Moved kill hogs to a separate repo.

Added unit tests

Changed user selection into a regexp.

Added another test and coverage.

Added more unit tests.

Added another test and coverage.

It's hogs not hoggs.

Made the user filter a configuration setting.

anonymized the p numbers

Updated the user selection regexp

anonymize output of w -s -h

Moved kill hogs to separate repo.

Put settings file back
pull/11/head
Egon Rijpkema 3 years ago
parent
commit
08751862ae
  1. 13
      roles/kill_memory_hogs/files/Pipfile
  2. 188
      roles/kill_memory_hogs/files/kill_hoggs.py
  3. 10
      roles/kill_memory_hogs/files/kill_hoggs.yml
  4. 16
      roles/kill_memory_hogs/files/kill_hogs.yml
  5. 49
      roles/kill_memory_hogs/files/test_kill_hoggs.py
  6. 24
      roles/kill_memory_hogs/tasks/main.yml

13
roles/kill_memory_hogs/files/Pipfile

@ -1,13 +0,0 @@ @@ -1,13 +0,0 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[packages]
# psutil is imported by kill_hoggs.py, so it has to be declared here as well.
psutil = "*"
pyaml = "*"
requests = "*"
[dev-packages]
[requires]
# NOTE(review): the deployment cron job runs /usr/bin/python36 -- confirm that
# 3.7 is the intended development version.
python_version = "3.7"

188
roles/kill_memory_hogs/files/kill_hoggs.py

@ -1,188 +0,0 @@ @@ -1,188 +0,0 @@
#!/usr/bin/env python3
from collections import defaultdict
import argparse
import json
import logging
import psutil
import requests
import subprocess
import time
import yaml
def post_to_slack(message: str,
                  config_path: str = '/opt/kill_hoggs/kill_hoggs.yml'):
    """
    Post a message to slack.

    The webhook URL is read from the ``slack_url`` key of the YAML settings
    file at ``config_path``. The HTTP status code and response body are
    written to the log; nothing is returned.

    Args:
        message (str): Message to post
        config_path (str): Path of the YAML settings file. Defaults to the
            location this script has always read.
    """
    with open(config_path, 'r') as f:
        # BaseLoader builds only plain strings, lists and dicts, so loading
        # the settings file cannot instantiate arbitrary Python objects.
        config = yaml.load(f.read(), Loader=yaml.BaseLoader)
    slack_url = config['slack_url']
    data = json.dumps({
        'channel': '#peregrine-alerts',
        'username': 'kill-hoggs',
        'text': message,
        'icon_emoji': ':scales:'
    }).encode('utf-8')
    # Announce the request before it is made so the log order matches reality.
    logging.info('Posting to slack')
    # NOTE(review): no timeout is passed, so a stalled connection blocks the
    # run. Adding one changes the call signature that the unit test asserts.
    response = requests.post(
        slack_url, data=data, headers={'Content-Type': 'application/json'})
    logging.info('%s%s', response.status_code, response.text)
def send_message_to_terminals(user: str, message: str):
    """
    Sends <message> to all terminals on which <user> is logged in.

    Delivery is best effort: a failure to start ``write`` is logged and the
    remaining terminals are still tried.

    Args:
        user (str): Login name of the recipient.
        message (str): Text to show on each of the user's terminals.
    """
    # The text is fed to write(1) on stdin instead of being pasted into a
    # shell command line, so quotes, `$` or backticks in the message (or in
    # the user name) can neither break the command nor be executed.
    payload = (message + '\n').encode('utf-8')
    for terminal in find_terminals_of_user(user):
        try:
            subprocess.run(['write', user, terminal], input=payload)
        except OSError as e:
            logging.warning('Could not write to %s on %s: %s',
                            user, terminal, e)
def find_terminals_of_user(user: str):
    """
    List the terminals on which <user> is logged in, according to ``w -s -h``.

    Args:
        user (str): The user whose terminals to return.
    Returns:
        list: A list of terminals (string). Empty when the user has no
        sessions or when ``w`` could not be started.
    """
    try:
        result = subprocess.run(
            ['w', '-s', '-h'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        # Messaging users is best effort; do not abort the caller over it.
        logging.warning('Could not run w: %s', e)
        return []
    # Decode the bytes properly instead of parsing their repr(), which would
    # strip a leading/trailing "b" or quote character from the real data.
    lines = result.stdout.decode('utf-8', errors='replace').splitlines()
    terminals = []
    for line in lines:
        fields = line.split()
        # Compare the login-name column exactly. A substring test over the
        # whole line also matches other users (p123 vs p1234) and commands.
        # NOTE(review): w may truncate long login names in this column --
        # confirm how long account names appear on the target hosts.
        if len(fields) >= 2 and fields[0] == user:
            terminals.append(fields[1])
    return terminals
def on_terminate(proc):
    """
    Log that <proc> has ended, together with its exit code.

    Passed as the callback to psutil.wait_procs() in terminate().
    """
    exit_code = proc.returncode
    logging.info(
        'process {} terminated with exit code {}'.format(proc, exit_code))
def terminate(kill_list):
    """
    Terminate processes. Kill if terminate is unsuccessful.

    Every process is first asked to terminate. Processes still alive after
    a 3 second wait are killed.

    Args:
        kill_list (list): List of processes to kill.
    """
    for proc in kill_list:
        try:
            proc.terminate()
        except psutil.NoSuchProcess:
            # The process exited on its own in the meantime; keep going so
            # the remaining processes are still handled.
            logging.info('{} is already gone'.format(proc))
    _gone, alive = psutil.wait_procs(
        kill_list, timeout=3, callback=on_terminate)
    for proc in alive:
        logging.info('Killing {} with signal 9'.format(proc))
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # Exited between the wait and the kill; nothing left to do.
            logging.info('{} is already gone'.format(proc))
def _is_killable_user(username):
    """Return True for s/p/f/g-number accounts and for 'umcg-' accounts."""
    # Slicing instead of indexing keeps an empty user name from raising.
    return ((username[:1] in ('s', 'p', 'f', 'g') and username[1:].isdigit())
            or username.startswith('umcg-'))


def kill_hogs(memory_threshold,
              cpu_threshold,
              dummy: bool = False,
              slack: bool = False,
              interval: float = .3,
              warning: str = ''):
    """
    Kill all processes of a user whose combined memory or cpu usage exceeds
    the given thresholds.

    For efficiency reasons a process is only counted when it uses at least
    .1 % of memory or at least 1 % of cpu. Processes owned by root and
    processes of accounts that are not s/p/f/g-number or 'umcg-' accounts
    are never counted.

    Args:
        memory_threshold (float): Percentage of user resources above which to kill.
        cpu_threshold (float): Percentage of user resources above which to kill.
        dummy (bool): If true, do not actually kill processes.
        slack (bool): send messages to slack.
        interval (float): Seconds between the two cpu samples.
        warning (str): Text sent to the user's terminals. A default text is
            used when empty.
    """
    if warning == '':
        warning = """Please submit your processes as a job.
Your processes have been killed and this incident has been reported.
For more information, see https://redmine.hpc.rug.nl/redmine/projects/peregrine/wiki/FAQ"""

    users = defaultdict(
        lambda: {'cpu_percent': 0, 'memory_percent': 0, 'processes': []})
    procs = list(psutil.process_iter())
    # First call of cpu_percent() without blocking interval is meaningless.
    # see https://psutil.readthedocs.io/en/latest/
    # So prime every process here, wait, and take the real reading below.
    for proc in procs:
        try:
            proc.cpu_percent()
        except (psutil.NoSuchProcess, FileNotFoundError):
            pass
    time.sleep(interval)

    for proc in procs:
        try:
            proc.cached_cpu_percent = proc.cpu_percent()
            proc.cached_memory_percent = proc.memory_percent()
            if proc.uids().real == 0:
                continue  # do not kill root processes.
            if (proc.cached_memory_percent < .1
                    and proc.cached_cpu_percent < 1):
                continue  # negligible usage, not worth counting.
            # Check username here. It is somewhat expensive.
            username = proc.username()
            if not _is_killable_user(username):
                continue  # we only kill processes of p, s, f and g accounts
            # Read the name now, inside the try, so that reporting later
            # cannot fail on a process that has exited in the meantime.
            proc.cached_name = proc.name()
            users[username]['memory_percent'] += proc.cached_memory_percent
            users[username]['cpu_percent'] += proc.cached_cpu_percent
            users[username]['processes'].append(proc)
        except (psutil.NoSuchProcess, FileNotFoundError):
            # The process ended while it was being sampled; skip it.
            pass

    for username, data in users.items():
        if not (data['memory_percent'] > memory_threshold
                or data['cpu_percent'] > cpu_threshold):
            continue
        message = [
            'User {} uses \n {:.2f} % of cpu. '.format(
                username, data['cpu_percent']),
            '{:.2f} % of memory. '.format(data['memory_percent']),
            'The following processes will be killed:'
        ]
        for proc in data['processes']:
            message.append(
                '{} pid {} {} memory {:.2f}% cpu {:.2f}%'.format(
                    username, proc.pid, proc.cached_name,
                    proc.cached_memory_percent, proc.cached_cpu_percent))
        logging.info('\n'.join(message))
        # Address the user by the grouping key rather than by whatever
        # process the loop above happened to end on.
        # NOTE(review): the warning is also sent when dummy is true, although
        # nothing is killed in that case -- confirm this is intended.
        send_message_to_terminals(username, warning)
        if slack:
            post_to_slack('\n'.join(message))
        if not dummy:
            terminate(data['processes'])
if __name__ == '__main__':
    # Command line entry point: parse the options and run one sweep.
    logging.basicConfig(level=logging.INFO)
    cli = argparse.ArgumentParser()

    # The two thresholds have the same shape, so declare them from a table.
    threshold_options = (
        ("--memory_threshold", 10,
         "memory percentage above which processes are killed"),
        ("--cpu_threshold", 600,
         "cpu percentage above which processes are killed"),
    )
    for flag, fallback, description in threshold_options:
        cli.add_argument(flag, type=float, default=fallback, help=description)

    # Boolean switches.
    cli.add_argument(
        "--dummy", action='store_true',
        help="Only display what would be killed")
    cli.add_argument(
        "--slack", action='store_true', help="Post messages to slack")

    options = cli.parse_args()
    kill_hogs(
        memory_threshold=options.memory_threshold,
        cpu_threshold=options.cpu_threshold,
        dummy=options.dummy,
        slack=options.slack)

10
roles/kill_memory_hogs/files/kill_hoggs.yml

@ -1,10 +0,0 @@ @@ -1,10 +0,0 @@
$ANSIBLE_VAULT;1.1;AES256
39663261313566383536653833343361656637623431663261386165646131326538396434356566
3030646561336333666362323238393631343236313434350a646434363161383330396634323066
61353861613330306666366263393131333639393734313130303531343264303165343365336237
3632646162333639370a636239353066316661326230616233643332666661656637383661363466
33666330343963396135666530383036306131376565343438363761633838633233616334383164
62383930613262613566643438326530343262623433353130613866373336326363346236383564
63616635623239613732313666653936663033646532623864313336656662626561613938623838
36366234343264396238343265343363363062343063313738396138393335383437653462626366
6534

16
roles/kill_memory_hogs/files/kill_hogs.yml

@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
$ANSIBLE_VAULT;1.1;AES256
38353638343531646539343066616364303531313933653965353563343962353033303563666361
3838326366316133396162626334363534356562333539340a633863343530353961646630613030
61666434346130306666646565386239353936326462383763623934303861373461313838346364
6238653639336539340a626138633033356434636533323539366166633066623736623334623232
63323935376238303130383130326264653166666666393238323630333963316261643939333961
34636331383732326638373331633730316238666565373438656566353836316637306362653136
61643561373266373963363435386333353837326464386331336136323135376339343734323536
32333132383632653231326562613533333764323133626435646235656139323665613834323838
36393163356233323739306436396435326437666233353034373334366337313564366431393363
65633166643430366562646534396135363032303632326365333834643039306539636639613138
64656134376264633638313031306566363933326162326136303534656262303062373139613566
63646636636539653462306361633830633730643563613865643333343962333363306239383564
62393363343139633535306266616435363064626434306639616565656633626564613536613963
30636530336363623635656162653836616435643138363962353266353263383530336535636136
373539323962633634383062393064303930

49
roles/kill_memory_hogs/files/test_kill_hoggs.py

@ -1,49 +0,0 @@ @@ -1,49 +0,0 @@
import unittest
from unittest import mock
import kill_hoggs
def mocked_requests_post(*args, **kwargs):
    """
    Stand-in for requests.post used by the tests.

    Returns a response object with status 200 when the first positional
    argument is the test webhook URL, and one with status 404 otherwise.

    Adapted from an answer here:
    https://stackoverflow.com/questions/15753390/how-can-i-mock-requests-and-the-response
    """
    class MockResponse:
        def __init__(self, json_data, status_code):
            self.json_data, self.status_code = json_data, status_code
            self.text = "MOcked successfully."

        def json(self):
            return self.json_data

    target = args[0]
    if target != 'https://hooks.slack.com/services/normally/random/string':
        payload, status = None, 404
    else:
        payload, status = {"key1": "value1"}, 200
    return MockResponse(payload, status)
# Stand-in for the settings file: the test patches builtins.open so that
# reading the file yields this YAML, whose slack_url is the one URL that
# mocked_requests_post answers with status 200.
dummy_config = '''
---
slack_url: 'https://hooks.slack.com/services/normally/random/string'
'''
class PostToSlackTestcase(unittest.TestCase):
    """Checks the HTTP request that post_to_slack sends."""

    # We patch 'requests.post' with our own method. The mock object is passed
    # in to our test case method. 'builtins.open' is patched as well, so the
    # settings file read by post_to_slack yields dummy_config.
    @mock.patch('requests.post', side_effect=mocked_requests_post)
    @mock.patch('builtins.open', mock.mock_open(read_data=dummy_config))
    def test_post_to_slack(self, mock_post):
        """
        Call post_to_slack and make sure requests.post was called exactly
        once, with the right parameters.
        """
        kill_hoggs.post_to_slack('Hello world')
        # A single assertion covers both "was called" and "with these
        # arguments"; checking that call_args_list contains its own first
        # element would always pass.
        mock_post.assert_called_once_with(
            'https://hooks.slack.com/services/normally/random/string',
            data=b'{"channel": "#peregrine-alerts", "username": "kill-hoggs", "text": "Hello world", "icon_emoji": ":scales:"}',
            headers={'Content-Type': 'application/json'})
if __name__ == '__main__':
    # Run the test cases in this module when the file is executed directly.
    unittest.main()

24
roles/kill_memory_hogs/tasks/main.yml

@ -8,28 +8,22 @@ @@ -8,28 +8,22 @@
- python36-requests
- python36-psutil
- name: make directory for script and settings.
file:
path: /opt/kill_hoggs
mode: 0777
state: directory
- git:
repo: 'git@github.com:rug-cit-hpc/kill-hogs.git'
dest: /opt/kill_hogs
version: master
update: yes
- name: Install python script
copy:
src: 'files/kill_hoggs.py'
dest: /opt/kill_hoggs/kill_hoggs.py
mode: 0700
- name: Install settings file
copy:
src: 'files/kill_hoggs.yml'
dest: /opt/kill_hoggs/kill_hoggs.yml
src: 'files/kill_hogs.yml'
dest: /opt/kill_hogs/kill_hogs.yml
mode: 0600
- cron:
name: Kill jobs that are using excessive memory.
minute: '*/10'
minute: '*/2'
user: root
job: '/usr/bin/python36 /opt/kill_hoggs/kill_hoggs.py --slack'
job: '/usr/bin/python36 /opt/kill_hogs/kill_hogs.py --slack'
cron_file: kill_memory_hogs

Loading…
Cancel
Save