From aa545626eba3aba9096ae1a8de4c21fdacfbbc3b Mon Sep 17 00:00:00 2001 From: Egon Rijpkema Date: Tue, 1 Oct 2019 11:10:54 +0200 Subject: [PATCH] Added nhc and /apps --- apps.yml | 20 +++ roles/slurm-client/handlers/main.yml | 9 ++ roles/slurm-client/tasks/main.yml | 13 ++ roles/slurm-client/templates/nhc.conf | 208 ++++++++++++++++++++++++++ 4 files changed, 250 insertions(+) create mode 100644 apps.yml create mode 100644 roles/slurm-client/templates/nhc.conf diff --git a/apps.yml b/apps.yml new file mode 100644 index 0000000..25c61d0 --- /dev/null +++ b/apps.yml @@ -0,0 +1,20 @@ +--- +# Playbook that adds the /etc/fstab entry for /apps +- hosts: all + become: true + tasks: + - name: make endpoint + file: + path: /apps + mode: 0777 + state: directory + + - name: Add line to fstab + lineinfile: + path: /etc/fstab + line: "172.23.56.1:/software /apps nfs vers=3,rsize=8192,wsize=8192,tcp,async,lock 0 0" + + - name: mount all mountpoints in fstab + command: mount -a + args: + warn: false diff --git a/roles/slurm-client/handlers/main.yml b/roles/slurm-client/handlers/main.yml index e394ea1..ea14b8f 100644 --- a/roles/slurm-client/handlers/main.yml +++ b/roles/slurm-client/handlers/main.yml @@ -20,6 +20,14 @@ state: reloaded become: true listen: restart_slurmd + +- name: Restart nhc service. + systemd: + name: 'nhc.service' + state: restarted + become: true + listen: restart_nhc + # # Service reloads after restarts. # @@ -36,4 +44,5 @@ state: reloaded become: true listen: reload_slurmd + ... 
diff --git a/roles/slurm-client/tasks/main.yml b/roles/slurm-client/tasks/main.yml index c643664..82971f9 100644 --- a/roles/slurm-client/tasks/main.yml +++ b/roles/slurm-client/tasks/main.yml @@ -53,6 +53,19 @@ become: true +- name: Install nhc for merlin nodes + template: + src: nhc.conf + owner: root + group: root + mode: 0644 + dest: /etc/nhc/nhc.conf + notify: + - restart_nhc + become: true + when: inventory_hostname in groups['merlin-nodes'] + + - name: Start slurm and munge services systemd: name: "{{ item }}" diff --git a/roles/slurm-client/templates/nhc.conf b/roles/slurm-client/templates/nhc.conf new file mode 100644 index 0000000..f73cfd9 --- /dev/null +++ b/roles/slurm-client/templates/nhc.conf @@ -0,0 +1,208 @@ +# NHC Configuration File (sample) +# +# Lines are in the form "<hostmask>||<check>" +# Hostmask is a glob, /regexp/, or {noderange} +# Comments begin with '#' +# +# $Id: nhc.conf 1774 2014-09-20 02:59:19Z mej $ +# + +####################################################################### +### +### NHC Configuration Variables +### +# Explicitly instruct NHC to assume Slurm is the Resource Manager + * || export NHC_RM=slurm + +# Do not mark nodes offline +# * || export MARK_OFFLINE=0 + +# Activate debugging mode +# * || export DEBUG=1 + +# Set watchdog timer to 15 seconds +# * || export TIMEOUT=15 + +# In out-of-band contexts, enable all checks +# * || export NHC_CHECK_ALL=1 + +# Make sure $PATH contains important directories for diagnostic commands +# * || export MOABHOMEDIR="/opt/moab" +# * || export PATH="$MOABHOMEDIR/bin:$PATH" + + +####################################################################### +### +### Hardware checks +### +# Set these to your correct socket, core, and thread counts. +# * || check_hw_cpuinfo 2 28 28 + +# Set these to the amount of physical RAM you have (leave the fudge factor). + * || check_hw_physmem 176gb 176gb 5% + +# Set these to the amount of swap you have (leave the fudge factor). 
+ * || check_hw_swap 0g 0g 3% + +# If you prefer to use this instead of the previous two, you can. +# * || check_hw_mem 40g 40g 5% + +# Check specifically for free physical memory. + * || check_hw_physmem_free 1MB + +# Same, but for swap space. +# * || check_hw_swap_free 1MB + +# Check for some sort of free memory of either type. + * || check_hw_mem_free 2GB + +# Checks that there's an FDR (56 Gb/s) IB interface that's ACTIVE and shows LinkUp. +# * || check_hw_ib 56 + +# Checks for an active Myrinet interface named "myri0." +# * || check_hw_gm myri0 + +# Checks for an active ethernet interface named "eth0." + * || check_hw_eth eth0 + +# Checks for an active ethernet interface named "eth1." + * || check_hw_eth eth1 + + +# Make sure we're running the correct BIOS version on all nodes. +# * || check_dmi_data_match "BIOS Information: Version: 1.0.4" + +# Make sure our RAM is running at the correct bus rate. +# * || check_dmi_data_match -t "Memory Device" "*Speed: 2133 MHz" + +# Check the mcelog daemon for any pending errors. + * || check_hw_mcelog + + +####################################################################### +### +### Filesystem checks +### +# All nodes should have their root filesystem mounted read/write. + * || check_fs_mount_rw -f / + +# Assert that /tmp is a mounted filesystem of type "tmpfs." +# * || check_fs_mount_rw -t tmpfs -f /tmp + +# Controlling TTYs are a good thing! + * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts + +# Make sure the root filesystem doesn't get too full. + * || check_fs_free / 3% + +# Free inodes are also important. + * || check_fs_ifree / 1k + +# The following illustrates how to assert an NFSv3 mount (or any other specific mount option). 
+# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home +* || check_fs_mount -s 172.23.56.1:/software -t nfs -o '/(^|,)vers=3(,|$)/' -f /apps +* || check_fs_mount -t lustre -f /home +* || check_fs_mount -t lustre -f /data +* || check_fs_mount -t lustre -f /scratch + + +####################################################################### +### +### File/metadata checks +### +# These should always be directories and always be read/write/execute and sticky. + * || check_file_test -r -w -x -d -k /tmp /var/tmp + +# These should always be readable and should never be empty. + * || check_file_test -r -s /etc/passwd /etc/group + +# Assert common properties for /dev/null (which occasionally gets clobbered). + * || check_file_test -c -r -w /dev/null /dev/zero + * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null + +# Make sure there's relatively recent activity from the syslog. +# * || check_file_stat -n 7200 /var/log/messages + +# Validate a couple important accounts in the passwd file. + * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*" + + +####################################################################### +### +### Process checks +### +# Everybody needs sshd running, right? But don't use -r (restart)! + * || check_ps_service -u root -S sshd + +# The cron daemon is another useful critter... + * || check_ps_service -r crond + +# Check for wulfd but don't manage it. +# * || check_ps_daemon wulfd root + +# Make sure no users are SSH'd in, but don't kill them. +# * || check_ps_blacklist sshd '!root' + +# Flag and kill any processes which are owned by unauthorized users. +# * || check_ps_unauth_users log syslog kill + +# Flag any user processes not properly parented. +# * || check_ps_userproc_lineage log syslog + +# Most systems also need NFS locking services. +# * || check_ps_service -d rpc.statd -r nfslock + +# The audit daemon can sometimes disappear if things get hairy. 
+# * || check_ps_service -r auditd + +# This is only valid for RHEL6 and similar/newer systems. + * || check_ps_service -d rsyslogd -r rsyslog + +# In the case of MySQL, it's typically better to cycle. +# * || check_ps_service -c mysqld + +# Double your core count is a good rule of thumb for load average max. +# * || check_ps_loadavg 24 +# This should work if you place it after one of the check_hw_*() checks. + * || check_ps_loadavg $((2*HW_CORES)) + + +####################################################################### +### +### TORQUE/Moab checks +### +# Monitor trqauthd to make sure it's always running. +# * || check_ps_service -u root -r trqauthd + +# Same for pbs_mom...just make sure NHC runs out-of-band in some way! +# * || check_ps_service -u root -r pbs_mom + +# On the master node, pbs_server gets monitored too. +# * || check_ps_service -u root -r pbs_server + +# Verify Moab status and version. +# * || check_moab_sched -t 10 -v 7.2.3 -m '!/PAUSED/' + +# RM engine sanity checks. +# * || check_moab_rm -t 10 + +# TORQUE configuration sanity checks. +# * || check_moab_torque -t 10 + +# Assert specific TORQUE settings that are critical to operation. +# * || check_file_contents $PBS_SERVER_HOME/mom_priv/config '/^\$pbsserver master$/' '/^\$spool_as_final_name true$/' '!/localhost/' + + +####################################################################### +### +### Other checks +### +# Check to verify that SELinux is disabled. (Remove the "-r 1" to verify it's enabled.) +# * || check_cmd_status -t 1 -r 1 selinuxenabled + +# Verify settings for an Ethernet interface. +# * || check_cmd_output -m '/addr:10\.0\.0\.1/' -m '/Bcast:10\.0\.0\.255/' -m '/Mask:255\.255\.255\.0/' -m '/^[[:space:]]*UP /' /sbin/ifconfig eth3 + +# nVidia HealthMon GPU health checks (requires Tesla Development Kit) +# * || check_nv_healthmon +