Browse Source

Added slack and cpu percentage.

pull/10/head
Egon Rijpkema 3 years ago
parent
commit
3bea4c3b29
  1. 114
      roles/kill_memory_hogs/files/kill_hoggs.py
  2. 10
      roles/kill_memory_hogs/files/kill_hoggs.yml
  3. 26
      roles/kill_memory_hogs/tasks/main.yml

114
roles/kill_memory_hogs/files/kill_hoggs.py

@ -1,41 +1,99 @@ @@ -1,41 +1,99 @@
#!/usr/bin/env python3
from collections import defaultdict
import argparse
import json
import logging
import psutil
from collections import defaultdict
import requests
import time
import yaml
def post_to_slack(message: str,
                  config_path: str = '/opt/kill_hoggs/kill_hoggs.yml'):
    """
    Post a message to slack.

    The webhook url is read from the ``slack_url`` key of the yaml settings
    file. Network failures are logged and swallowed: this function is a
    notification side channel and must not abort the caller.

    Args:
        message (str): Message to post
        config_path (str): Location of the yaml settings file. The default
            is the path the kill_memory_hogs role installs the settings to.

    Raises:
        OSError: If the settings file cannot be read.
        KeyError: If the settings file has no ``slack_url`` entry.
    """
    with open(config_path, 'r') as f:
        # safe_load instead of load: the settings are plain scalars, and
        # yaml.load() without an explicit Loader is deprecated.
        config = yaml.safe_load(f)

    slack_url = config['slack_url']
    data = json.dumps({
        'channel': '#peregrine-alerts',
        'username': 'kill-hoggs',
        'text': message,
        'icon_emoji': ':scales:'
    }).encode('utf-8')

    logging.info('Posting to slack')
    try:
        # A timeout keeps a hanging webhook from stalling the cron job.
        response = requests.post(
            slack_url,
            data=data,
            headers={'Content-Type': 'application/json'},
            timeout=10)
    except requests.RequestException:
        logging.exception('Posting to slack failed')
        return
    logging.info(str(response.status_code) + str(response.text))
def on_terminate(proc):
    """
    Callback for terminate()

    Logs the process that ended together with its exit code.

    Args:
        proc: The terminated process; must expose a ``returncode``
            attribute.
    """
    # Lazy %-style arguments: the message is only built if INFO is enabled.
    logging.info('process %s terminated with exit code %s',
                 proc, proc.returncode)
def terminate(kill_list):
    """
    Terminate processes. Kill if terminate is unsuccessful.

    Every process is first asked to terminate. Processes that are still
    alive after a 3 second grace period are killed.

    Args:
        kill_list (list): List of processes to kill.
    """
    for proc in kill_list:
        try:
            proc.terminate()
        except psutil.NoSuchProcess:
            # The process exited on its own between listing and now;
            # nothing left to do for it, carry on with the others.
            continue

    _gone, alive = psutil.wait_procs(
        kill_list, timeout=3, callback=on_terminate)

    for proc in alive:
        logging.info('Killing %s with signal 9', proc)
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # Exited after the grace period but before the kill.
            continue
def kill_hogs(threshold, dummy=False):
def kill_hogs(memory_threshold,
cpu_threshold,
dummy: bool = False,
slack: bool = False,
interval: float = .3):
"""
Kill all processes of a user using more than <threshold> % of memory.
Kill all processes of a user using more than <threshold> % of memory. And cpu.
For efficiency reasons only processes using more than .1 % of the available
resources are counted.
Args:
memory_threshold (float): Percentage of user resources above which to kill.
cpu_threshold (float): Percentage of user resources above which to kill.
dummy (bool): If true, do not actually kill processes.
slack (bool): send messages to slack.
"""
users = defaultdict(lambda: {'memory_percent': 0, 'processes': []})
users = defaultdict(lambda: {'cpu_percent': 0, 'memory_percent': 0, 'processes': []})
for proc in psutil.process_iter():
if proc.uids().real == 0 or proc.memory_percent() < .1:
procs = list(psutil.process_iter())
for proc in procs:
try:
proc.cpu_percent()
except Exception as e:
logging.exception(e)
time.sleep(interval)
for proc in procs:
try:
# First call t0 cpu_percent() without blocking interval is meaningless.
# see https://psutil.readthedocs.io/en/latest/
proc_cpu_percent = proc.cpu_percent()
except Exception as e:
logging.exception(e)
proc_cpu_percent = 0
if proc.uids().real == 0 or (proc.memory_percent() < .1
and proc_cpu_percent < 1):
continue # do not kill root processes.
# Check username here. It is somewhat expensive.
username = proc.username()
@ -44,31 +102,53 @@ def kill_hogs(threshold, dummy=False): @@ -44,31 +102,53 @@ def kill_hogs(threshold, dummy=False):
continue # we only kill processes of p, s and f accounts.
users[username]['memory_percent'] += proc.memory_percent()
users[username]['cpu_percent'] += proc_cpu_percent
users[username]['processes'].append(proc)
for username, data in users.items():
if data['memory_percent'] > threshold:
print('User {} uses {:.2f} % of memory. '
'The following processes will be killed:'.format(
username, data['memory_percent']))
if data['memory_percent'] > memory_threshold or data['cpu_percent'] > cpu_threshold:
message = [
'User {} uses {:.2f} % of cpu. '.format(
username, data['cpu_percent']),
'User {} uses {:.2f} % of memory. '.format(
username, data['memory_percent']),
'The following processes will be killed:'
]
for proc in data['processes']:
print('{} pid {} {} {:.2f}%'.format(proc.username(), proc.pid,
proc.name(),
proc.memory_percent()))
message.append('{} pid {} {} memory {:.2f}% cpu {:.2f}%'.format(
proc.username(), proc.pid, proc.name(),
proc.memory_percent(),
proc_cpu_percent))
logging.info('\n'.join(message))
if slack:
post_to_slack('\n'.join(message))
if not dummy:
terminate(data['processes'])
if __name__ == '__main__':
    # Command line entry point: parse the thresholds and flags, then run
    # a single pass of kill_hogs().
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--memory_threshold",
        "--threshold",  # previous name of this flag, kept as an alias
        dest="memory_threshold",
        type=float,
        default=10,
        help="memory percentage above which processes are killed")
    parser.add_argument(
        "--cpu_threshold",
        type=float,
        default=10,
        help="cpu percentage above which processes are killed")
    parser.add_argument(
        "--dummy",
        action='store_true',
        help="Only display what would be killed")
    parser.add_argument(
        "--slack", action='store_true', help="Post messages to slack")
    args = parser.parse_args()
    kill_hogs(
        memory_threshold=args.memory_threshold,
        cpu_threshold=args.cpu_threshold,
        dummy=args.dummy,
        slack=args.slack)

10
roles/kill_memory_hogs/files/kill_hoggs.yml

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
$ANSIBLE_VAULT;1.1;AES256
39663261313566383536653833343361656637623431663261386165646131326538396434356566
3030646561336333666362323238393631343236313434350a646434363161383330396634323066
61353861613330306666366263393131333639393734313130303531343264303165343365336237
3632646162333639370a636239353066316661326230616233643332666661656637383661363466
33666330343963396135666530383036306131376565343438363761633838633233616334383164
62383930613262613566643438326530343262623433353130613866373336326363346236383564
63616635623239613732313666653936663033646532623864313336656662626561613938623838
36366234343264396238343265343363363062343063313738396138393335383437653462626366
6534

26
roles/kill_memory_hogs/tasks/main.yml

@ -1,15 +1,37 @@ @@ -1,15 +1,37 @@
---
# Installs the kill_hoggs script with its settings file and schedules it
# from root's crontab.
- name: Install yum dependencies
  yum:
    state: latest
    update_cache: yes
    disable_gpg_check: yes
    name:
      - python36-requests
      - python36-psutil
      # kill_hoggs.py imports yaml to read its settings file.
      - python36-PyYAML

- name: make directory for script and settings.
  file:
    path: /opt/kill_hoggs
    # Not world-writable: the script in here is executed by root's cron,
    # so other users must not be able to replace files in this directory.
    mode: '0755'
    state: directory

- name: Install python script
  copy:
    src: 'files/kill_hoggs.py'
    dest: /opt/kill_hoggs/kill_hoggs.py
    mode: '0700'

- name: Install settings file
  copy:
    src: 'files/kill_hoggs.yml'
    dest: /opt/kill_hoggs/kill_hoggs.yml
    # Holds the slack webhook url; readable by the owner only.
    mode: '0600'

- cron:
    name: Kill jobs that are using excessive memory.
    # Every ten minutes.
    minute: '*/10'
    user: root
    job: '/usr/bin/python36 /opt/kill_hoggs/kill_hoggs.py --slack --dummy'
    cron_file: kill_memory_hogs
- cronvar:

Loading…
Cancel
Save