Browse Source

Added nhc and /apps

pull/12/head
Egon Rijpkema 3 years ago
parent
commit
aa545626eb
  1. 20
      apps.yml
  2. 9
      roles/slurm-client/handlers/main.yml
  3. 13
      roles/slurm-client/tasks/main.yml
  4. 208
      roles/slurm-client/templates/nhc.conf

20
apps.yml

@ -0,0 +1,20 @@ @@ -0,0 +1,20 @@
---
# Playbook that adds the /etc/fstab entry for /apps
- hosts: all
become: true
tasks:
- name: make endpoint
file:
path: /apps
mode: 0777
state: directory
- name: Add line to fstab
lineinfile:
path: /etc/fstab
line: "172.23.56.1:/software /apps nfs vers=3,rsize=8192,wsize=8192,tcp,async,lock 0 2"
- name: mount all mountpoints in fstab
command: mount -a
args:
warn: false

9
roles/slurm-client/handlers/main.yml

@ -20,6 +20,14 @@ @@ -20,6 +20,14 @@
state: reloaded
become: true
listen: restart_slurmd
- name: Restart nhc service.
systemd:
name: 'nhc.service'
state: restarted
become: true
listen: restart_nhc
#
# Service reloads after restarts.
#
@ -36,4 +44,5 @@ @@ -36,4 +44,5 @@
state: reloaded
become: true
listen: reload_slurmd
...

13
roles/slurm-client/tasks/main.yml

@ -53,6 +53,19 @@ @@ -53,6 +53,19 @@
become: true
- name: Install nhc for merlin nodes
template:
src: nhc.conf
owner: root
group: root
mode: 0644
dest: /etc/nhc/nhc.conf
notify:
- restart_nhc
become: true
when: inventory_hostname in groups['merlin-nodes']
- name: Start slurm and munge services
systemd:
name: "{{ item }}"

208
roles/slurm-client/templates/nhc.conf

@ -0,0 +1,208 @@ @@ -0,0 +1,208 @@
# NHC Configuration File (sample)
#
# Lines are in the form "<hostmask>||<check>"
# Hostmask is a glob, /regexp/, or {noderange}
# Comments begin with '#'
#
# $Id: nhc.conf 1774 2014-09-20 02:59:19Z mej $
#
#######################################################################
###
### NHC Configuration Variables
###
# Explicitly instruct NHC to assume PBS (TORQUE, PBSPro) is the Resource Manager
* || export NHC_RM=slurm
# Do not mark nodes offline
# * || export MARK_OFFLINE=0
# Activate debugging mode
# * || export DEBUG=1
# Set watchdog timer to 15 seconds
# * || export TIMEOUT=15
# In out-of-band contexts, enable all checks
# * || export NHC_CHECK_ALL=1
# Make sure $PATH contains important directories for diagnostic commands
# * || export MOABHOMEDIR="/opt/moab"
# * || export PATH="$MOABHOMEDIR/bin:$PATH"
#######################################################################
###
### Hardware checks
###
# Set these to your correct socket, core, and thread counts.
# * || check_hw_cpuinfo 2 28 28
# Set these to the amount of physical RAM you have (leave the fudge factor).
* || check_hw_physmem 176gb 176gb 5%
# Set these to the amount of swap you have (leave the fudge factor).
* || check_hw_swap 0g 0g 3%
# If you prefer to use this instead of the previous two, you can.
# * || check_hw_mem 40g 40g 5%
# Check specifically for free physical memory.
* || check_hw_physmem_free 1MB
# Same, but for swap space.
# * || check_hw_swap_free 1MB
# Check for some sort of free memory of either type.
* || check_hw_mem_free 2GB
# Checks that there's a QDR IB interface that's ACTIVE and shows LinkUp.
# * || check_hw_ib 56
# Checks for an active Myrinet interface named "myri0."
# * || check_hw_gm myri0
# Checks for an active ethernet interface named "eth0."
* || check_hw_eth eth0
# Checks for an active ethernet interface named "eth1."
* || check_hw_eth eth1
# Make sure we're running the correct BIOS version on all nodes.
# * || check_dmi_data_match "BIOS Information: Version: 1.0.4"
# Make sure our RAM is running at the correct bus rate.
# * || check_dmi_data_match -t "Memory Device" "*Speed: 2133 MHz"
# Check the mcelog daemon for any pending errors.
* || check_hw_mcelog
#######################################################################
###
### Filesystem checks
###
# All nodes should have their root filesystem mounted read/write.
* || check_fs_mount_rw -f /
# Assert that /tmp is a mounted filesystem of type "tmpfs."
# * || check_fs_mount_rw -t tmpfs -f /tmp
# Controlling TTYs are a good thing!
* || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts
# Make sure the root filesystem doesn't get too full.
* || check_fs_free / 3%
# Free inodes are also important.
* || check_fs_ifree / 1k
# The following illustrates how to assert an NFSv3 mount (or any other specific mount option).
# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home
* || check_fs_mount -s 172.23.56.1:/software -t nfs -o '/(^|,)vers=3(,|$)/' -f /apps
* || check_fs_mount -t lustre -f /home
* || check_fs_mount -t lustre -f /data
* || check_fs_mount -t lustre -f /scratch
#######################################################################
###
### File/metadata checks
###
# These should always be directories and always be read/write/execute and sticky.
* || check_file_test -r -w -x -d -k /tmp /var/tmp
# These should always be readable and should never be empty.
* || check_file_test -r -s /etc/passwd /etc/group
# Assert common properties for /dev/null (which occasionally gets clobbered).
* || check_file_test -c -r -w /dev/null /dev/zero
* || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null
# Make sure there's relatively recent activity from the syslog.
# * || check_file_stat -n 7200 /var/log/messages
# Validate a couple important accounts in the passwd file.
* || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"
#######################################################################
###
### Process checks
###
# Everybody needs sshd running, right? But don't use -r (restart)!
* || check_ps_service -u root -S sshd
# The cron daemon is another useful critter...
* || check_ps_service -r crond
# Check for wulfd but don't manage it.
# * || check_ps_daemon wulfd root
# Make sure no users are SSH'd in, but don't kill them.
# * || check_ps_blacklist sshd '!root'
# Flag and kill any processes which are owned by unauthorized users.
# * || check_ps_unauth_users log syslog kill
# Flag any user processes not properly parented.
# * || check_ps_userproc_lineage log syslog
# Most systems also need NFS locking services.
# * || check_ps_service -d rpc.statd -r nfslock
# The audit daemon can sometimes disappear if things get hairy.
# * || check_ps_service -r auditd
# This is only valid for RHEL6 and similar/newer systems.
* || check_ps_service -d rsyslogd -r rsyslog
# In the case of MySQL, it's typically better to cycle.
# * || check_ps_service -c mysqld
# Double your core count is a good rule of thumb for load average max.
# * || check_ps_loadavg 24
# This should work if you place it after one of the check_hw_*() checks.
* || check_ps_loadavg $((2*HW_CORES))
#######################################################################
###
### TORQUE/Moab checks
###
# Monitor trqauthd to make sure it's always running.
# * || check_ps_service -u root -r trqauthd
# Same for pbs_mom...just make sure NHC runs out-of-band in some way!
# * || check_ps_service -u root -r pbs_mom
# On the master node, pbs_server gets monitored too.
# * || check_ps_service -u root -r pbs_server
# Verify Moab status and version.
# * || check_moab_sched -t 10 -v 7.2.3 -m '!/PAUSED/'
# RM engine sanity checks.
# * || check_moab_rm -t 10
# TORQUE configuration sanity checks.
# * || check_moab_torque -t 10
# Assert specific TORQUE settings that are critical to operation.
# * || check_file_contents $PBS_SERVER_HOME/mom_priv/config '/^\$pbsserver master$/' '/^\$spool_as_final_name true$/' '!/localhost/'
#######################################################################
###
### Other checks
###
# Check to verify that SELinux is disabled. (Remove the "-r 1" to verify it's enabled.)
# * || check_cmd_status -t 1 -r 1 selinuxenabled
# Verify settings for an Ethernet interface.
# * || check_cmd_output -m '/addr:10\.0\.0\.1/' -m '/Bcast:10\.0\.0\.255/' -m '/Mask:255\.255\.255\.0/' -m '/^[[:space:]]*UP /' /sbin/ifconfig eth3
# nVidia HealthMon GPU health checks (requires Tesla Development Kit)
# * || check_nv_healthmon
Loading…
Cancel
Save