
Made a one-stop compute_node playbook.

pull/12/head
Egon Rijpkema 3 years ago
commit e99824c7b8
  1. compute_node.yml | 14
  2. hosts | 2
  3. pg-tools.yml | 287
  4. roles/slurm-client/tasks/main.yml | 54
  5. roles/slurm-client/templates/plugstack.conf | 9
  6. roles/slurm-client/templates/plugstack.conf.d/private-tmpdir.conf | 29
  7. roles/slurm-client/templates/slurm.epilog | 27
  8. roles/slurm-client/templates/slurm.epilog.clean | 37
  9. roles/slurm-client/templates/slurm.epilog.jobinfo | 118
  10. roles/slurm-client/templates/slurm.prolog | 11
  11. roles/slurm-client/templates/slurm.taskprolog | 12

compute_node.yml | 14

@@ -0,0 +1,14 @@
---
# All playbooks / roles to create a functioning compute_node (hopefully)
- import_playbook: apps.yml
- import_playbook: pg-tools.yml
- hosts: "compute_node"
  become: true
  roles:
    - common
    - ldap
    - node_exporter
    - slurm-client
    - lustre_client
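
For reference, the combined playbook should be runnable against the compute_node group with the inventory file from this repo, along the lines of:

    $ ansible-playbook -i hosts compute_node.yml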

hosts | 2

@@ -8,7 +8,7 @@ pg-node001
 pg-login
 [compute_node]
-pg-node[004:225]
+pg-node[004:234]
 [compute004-050]
 pg-node[004:050]
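
Note: Ansible expands bracket ranges numerically and keeps the zero padding, so pg-node[004:234] covers pg-node004 through pg-node234. A quick way to check what the updated group matches:

    $ ansible compute_node -i hosts --list-hosts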

pg-tools.yml | 287

@@ -0,0 +1,287 @@
# Usage: $ ansible-playbook pg-tools.yml
# Install all packages.
---
- name: install
  hosts: compute_node
  tasks:
    - name: tools
      yum:
        name:
          - ImageMagick
          - Lmod
          - ORBit2-devel
          - alsa-lib-devel
          - atop
          - attr
          - bash-completion
          - cowsay
          - cvmfs
          - cvmfs-config-euclid
          - cvmfs-config-none
          - dbus-glib-devel
          - dos2unix
          - elfutils-devel
          - emacs
          - evolution-data-server-devel
          - expect
          - finger
          - firefox
          - ftp
          - gawk
          - gcc
          - gcc-c++
          - gd
          - gdbm-devel
          - gettext-devel
          - glibc-devel
          - gnome-vfs2-devel
          - gstreamer-devel
          - gstreamer-plugins-base-devel
          - gtk2-devel
          - guile-devel
          - gvfs-devel
          - hdf5-devel
          - htop
          - hwloc
          - hwloc-devel
          - infiniband-diags
          - infinipath-psm
          - iperf
          - ipmitool
          - irods-icommands
          - java-1.6.0-openjdk-devel
          - java-1.7.0-openjdk-devel
          - java-1.8.0-openjdk
          - jobinfo
          - kernel-devel
          - kernel-headers
          - keyutils-libs-devel
          - ksh
          - lbnl-nhc
          - libICE-devel
          - libIDL-devel
          - libXcomposite-devel
          - libXft-devel
          - libXmu-devel
          - libXp-devel
          - libXxf86vm-devel
          - libattr-devel
          - libcap
          - libcap-devel
          - libcgroup
          - libcgroup-devel
          - libcom_err-devel
          - libcurl-devel
          - libgcc.i686
          - libgcrypt-devel
          - libgdata-devel
          - libgdiplus
          - libgeotiff-devel
          - libgnomecanvas-devel
          - libgnomeui-devel
          - libgpg-error-devel
          - libgsf-devel
          #- libibcm-devel
          #- libibumad-devel
          - libical-devel
          - libjpeg-turbo-devel
          - libpng-devel
          - libpng12
          - librdmacm-devel
          - librsvg2-devel
          - libstdc++
          - libstdc++.i686
          - libtasn1-devel
          - libwmf-lite
          - libxcb-devel
          - libxslt-devel
          - lua-devel
          - make
          - mc
          - mesa-dri-drivers
          - mysql
          - mysql-devel
          - nano
          - nedit
          - nspr-devel
          - nss-softokn-devel
          - nss-softokn-freebl-devel
          - ntp
          - numactl
          - numactl-devel
          - opensm
          - openssh-ldap
          - openssl-devel
          - pam-devel
          - papi-devel
          - parallel
          - perftest
          - perl
          - perl
          - perl-CPAN
          - perl-DBI
          - perl-ExtUtils-CBuilder
          - perl-ExtUtils-MakeMaker
          - perl-Switch
          - perl-Switch
          - perl-XML-LibXML
          - perl-YAML
          - perl-devel
          - perl-libxml-perl
          - pixman-devel
          - plplot-libs
          - pmix
          - policycoreutils-python
          - polkit-devel
          - postgresql-devel
          - pygobject2-devel
          - qperf
          - qpid-cpp-server
          - qpid-cpp-server-linearstore
          - qpid-tools
          - readline-devel
          - rpm-build
          - screen
          - sg3_utils
          - singularity
          - singularity-runtime
          - sl
          - sqlite-devel
          - squashfuse
          - systemtap-devel
          - tcsh
          - telnet
          - time
          - tk-devel
          - traceroute
          - unzip
          - vim
          - xorg-x11-apps
          - xorg-x11-proto-devel
          - xorg-x11-server-Xvfb
          - xorg-x11-xauth
          - xterm
          - yum-plugin-priorities
          - zip
          - binutils-devel
          - zlib-devel
          - bzip2-devel
          - atk-devel
          - libXext-devel
          - libXcursor-devel
          - libXinerama-devel
          - cairo-devel
          - libmng-devel
          - pciutils-devel
          - mesa-libGLw-devel
          - libibmad-devel
          - libsepol-devel
          - libvirt-java-devel
          # - blcr-devel
          - opensm-devel
          - libicu-devel
          - expat-devel
          - unixODBC-devel
          - openssl-devel
          - db4-devel
          - libglade2
          - libart_lgpl-devel
          - nss-util-devel
          - popt-devel
          - libSM-devel
          - pycairo-devel
          - GConf2-devel
          - gnome-keyring-devel
          - libsoup-devel
          - check-devel
          - libbonoboui-devel
          - startup-notification-devel
          - gnome-desktop-devel
          - libcanberra-devel
          - boost-devel
          - pulseaudio-libs-devel
          - libcap-ng-devel
          - freetype-devel
          - libXau-devel
          - libXrender-devel
          - libXrandr-devel
          - libXdamage-devel
          - pango-devel
          - mesa-libGL-devel
          - qt-devel
          - libXtst-devel
          - libselinux-devel
          - libstdc++-devel
          - libvirt-devel
          - proj-devel
          - libconfuse-devel
          - krb5-devel
          - dbus-devel
          - elfutils-libelf-devel
          - python-devel
          - libidn-devel
          - libbonobo-devel
          - gnutls-devel
          - gmp-devel
          - nss-devel
          - libgnome-devel
          - geos-devel
          - libnotify-devel
          - pygtk2-devel
          - tcl-devel
          - audit-libs-devel
          - fontconfig-devel
          - libXt-devel
          - libXfixes-devel
          - openmotif-devel
          - rrdtool-devel
          - readline-devel
          - libcroco-devel
          - libgweather-devel
          - libacl-devel
          - gdal-devel
          - tbb-devel
          - glib2-devel
          - cyrus-sasl-devel
          - libX11-devel
          - libXi-devel
          - libdrm-devel
          - mesa-libGLU-devel
          - libxml2-devel
          - qt3-devel
          - ncurses-devel
          - gtk2-devel
          - numactl-devel
          - ncurses-devel
          - irods-dev
          - plplot-devel
          - libstdc++-devel.i686
          - libtar
          - lapack-devel
          - infinipath-psm-devel
          - atlas-devel
          - tcl-devel
          - tk-devel
          - libstdc++-devel
          - java-1.8.0-openjdk-devel
          - opensm-devel
          - libicu-devel
          - gnuplot
          - yum-utils
          - gdb
          - bc
          - bison
          - byacc
          - doxygen
          - flex
          - fuse-devel
          - gstreamer-plugins-good
          - libaio-devel
          - libXaw-devel
          - libXpm-devel
          - subversion
          - subversion-libs
          - swig
          - perl-PerlIO-gzip
          - dialog
        state: present
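
Because the whole list is passed to a single yum task, the packages should be resolved in one transaction. A dry run against a single node is possible with check mode before rolling this out, e.g.:

    $ ansible-playbook pg-tools.yml --check --limit pg-node004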

roles/slurm-client/tasks/main.yml | 54

@@ -16,9 +16,22 @@
     state: latest
     update_cache: yes
     name:
-      - lbnl-nhc
       - munge
+      - munge-devel
+      - munge-libs
       - slurm
+      - slurm-contribs
+      - slurm-devel
+      - slurm-munge
+      - slurm-pam_slurm
+      - slurm-perlapi
+      - slurm-plugins
+      - slurm-sjobexit
+      - slurm-sjstat
+      - slurm-slurmd
+      - lbnl-nhc
+      - slurm-spank-private-tmpdir
+      - slurm-torque
   notify:
     - restart_munge
     - restart_slurmd
@@ -36,6 +49,21 @@
     - restart_slurmd
   become: true
+- name: make /etc/slurm/plugstack.conf.d
+  file:
+    path: /etc/slurm/plugstack.conf.d
+    state: directory
+    owner: root
+    mode: 0755
+- name: Deploy slurm config files.
+  template:
+    src: plugstack.conf.d/private-tmpdir.conf
+    dest: /etc/slurm/plugstack.conf.d/private-tmpdir.conf
+    owner: root
+    group: root
+    mode: 0644
 - name: Deploy slurm config files.
   template:
     src: "{{ item }}"
@@ -44,10 +72,28 @@
     group: root
     mode: 0644
   with_items:
-    - roles/slurm-management/files/slurm.conf
-    - gres.conf
-    - cgroup.conf
+    - acct_gather.conf
+    - cgroup.conf
+    - gres.conf
+    - plugstack.conf
+    - roles/slurm-management/files/slurm.conf
   notify:
     - reload_slurmd
   become: true
+- name: Deploy slurm pro/epilog
+  template:
+    src: "{{ item }}"
+    dest: /etc/slurm/{{ item | basename }}
+    owner: root
+    group: root
+    mode: 0755
+  with_items:
+    - slurm.epilog
+    - slurm.epilog.clean
+    - slurm.epilog.jobinfo
+    - slurm.prolog
+    - slurm.taskprolog
+  notify:
+    - reload_slurmd
+  become: true
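
The restart_munge, restart_slurmd and reload_slurmd handlers notified here are defined elsewhere in the role. Assuming they wrap the usual systemd units, the manual equivalent on a node would be roughly:

    $ systemctl restart munge slurmd   # after package changes
    $ systemctl reload slurmd          # after config/template changes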

roles/slurm-client/templates/plugstack.conf | 9

@@ -0,0 +1,9 @@
#
# SLURM plugin stack config file for "hpc2n-tmpdir"
#
# required/optional plugin arguments
#
# Note: It's OK to bind-mount /tmp _IF_ you do it as last mount.
#
#required /usr/lib64/slurm/hpc2n-tmpdir.so base=/dev/shm/slurm mount=/dev/shm
required private-tmpdir.so base=/local/slurm mount=/local base=/dev/shm/slurm mount=/dev/shm base=/tmp/slurm mount=/tmp

roles/slurm-client/templates/plugstack.conf.d/private-tmpdir.conf | 29

@@ -0,0 +1,29 @@
# SLURM plugin stack config file for private-tmpdir
#
# Required configuration parameters:
#
# base: For each job the plugin will create a directory named
# $base.$SLURM_JOB_ID.$SLURM_RESTART_COUNT
#
# It is possible to specify multiple bases. This is typically
# used to have bind mounts to multiple file systems.
#
# mount: Private mount point. This can be specified more than once.
#
# For each mount, a directory will be created in the most recently
# specified base dir and then bind mounted on the specified
# mount point.
#
# If a mount parameter is a parent of the base parameter, it
# has to be specified last. Also, only one such mount will
# work.
#
# Example configuration:
#-------------------------------------------------------------------------------
# required private-tmpdir.so base=/tmp/slurm mount=/var/tmp mount=/tmp
#-------------------------------------------------------------------------------
#
# Example configuration with multiple base parameters:
#-------------------------------------------------------------------------------
# required private-tmpdir.so base=/dev/shm/slurm mount=/dev/shm base=/tmp/slurm mount=/var/tmp mount=/tmp
#-------------------------------------------------------------------------------
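
A quick way to confirm the private mounts from inside an allocation (illustrative): with the plugin active, /tmp inside a job should show up as a bind mount of $base.$SLURM_JOB_ID.$SLURM_RESTART_COUNT rather than the node's real /tmp:

    $ srun findmnt /tmp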

roles/slurm-client/templates/slurm.epilog | 27

@@ -0,0 +1,27 @@
#!/bin/bash
if [ x$SLURM_JOB_ID = "x" ] ; then
    exit 0
fi
SCRATCHDIR=/scratch/jobs/${SLURM_JOB_ID}
if [ -d ${SCRATCHDIR} ] ; then
    rm -rf ${SCRATCHDIR}
fi
if [ x$SLURM_RESTART_COUNT = "x" ] ; then
    SLURM_RESTART_COUNT=0
fi
DEVSHMDIR=/dev/shm/slurm.${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}
if [ -d ${DEVSHMDIR} ] ; then
    rm -rf ${DEVSHMDIR}
fi
LOCALDIR=/local/slurm.${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}
if [ -d ${LOCALDIR} ] ; then
    rm -rf ${LOCALDIR}
fi
MYTMPDIR=/tmp/slurm.${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}
if [ -d ${MYTMPDIR} ] ; then
    rm -rf ${MYTMPDIR}
fi
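
The paths cleaned up here mirror the base dirs in plugstack.conf (base=/local/slurm, base=/dev/shm/slurm, base=/tmp/slurm): a first run of job 1234 leaves /local/slurm.1234.0, /dev/shm/slurm.1234.0 and /tmp/slurm.1234.0 behind, which is exactly what this epilog removes.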

roles/slurm-client/templates/slurm.epilog.clean | 37

@@ -0,0 +1,37 @@
#!/bin/sh
#
# This script will kill any user processes on a node when the last
# SLURM job there ends. For example, if a user directly logs into
# an allocated node, SLURM will not kill that process without this
# script being executed as an epilog.
#
# SLURM_BIN can be used for testing with a private version of SLURM
#SLURM_BIN="/usr/bin/"
#
if [ x$SLURM_UID = "x" ] ; then
    exit 0
fi
if [ x$SLURM_JOB_ID = "x" ] ; then
    exit 0
fi
#
# Don't try to kill user root or system daemon jobs
#
if [ $SLURM_UID -lt 100 ] ; then
    exit 0
fi
job_list=`${SLURM_BIN}squeue --noheader --format=%A --user=$SLURM_UID --node=localhost`
for job_id in $job_list
do
    if [ $job_id -ne $SLURM_JOB_ID ] ; then
        exit 0
    fi
done
#
# No other SLURM jobs, purge all remaining processes of this user
#
pkill -KILL -U $SLURM_UID
exit 0
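
None of the prolog/epilog scripts take effect until slurm.conf points at them; that wiring presumably lives in roles/slurm-management/files/slurm.conf. An illustrative setup, using the standard slurm.conf parameters:

    Prolog=/etc/slurm/slurm.prolog
    Epilog=/etc/slurm/slurm.epilog
    TaskProlog=/etc/slurm/slurm.taskprolog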

roles/slurm-client/templates/slurm.epilog.jobinfo | 118

@@ -0,0 +1,118 @@
#!/bin/sh
#
# This epilog script writes some job details to the output file of completed jobs.
#
# This epilog only works with a valid $SLURM_JOB_ID.
if [ x$SLURM_JOB_ID = "x" ] ; then
    exit 0
fi
# Only run this epilog on the first node in the list of allocated nodes.
if [ `scontrol show hostnames | head -n 1` = `hostname` ]
then
    #############################################################
    #                                                           #
    # Give the user some basic details about the completed job  #
    #                                                           #
    #############################################################
    #################################
    #                               #
    # Grab some job details         #
    #                               #
    #################################
    workdir=`scontrol show job $SLURM_JOB_ID | grep WorkDir | cut -d= -f2`
    outfile=`scontrol show job $SLURM_JOB_ID | grep StdOut | cut -d= -f2`
    # No need for cluster name on Peregrine
    #cluster=`scontrol show config | grep ClusterName | awk '{print $3}'`
    #################################################
    #                                               #
    # Recording the data for the future             #
    # Do we want this?                              #
    #                                               #
    #################################################
    #recordsdirbase=/home/support/root/slurm_job_records  # so we can keep a record of job details for the future
    #recordsdir=$recordsdirbase/$cluster/$(date +%Y)      # per-cluster and per-year, to reduce the size of each dir
    # Protect the permissions of the base level, just in case.
    #if [ ! -d "$recordsdirbase" ]
    #then
    #    mkdir -p $recordsdirbase
    #    chmod 700 $recordsdirbase
    #fi
    # Create the records dir if required.
    #if [ ! -d "$recordsdir" ]
    #then
    #    mkdir -p $recordsdir
    #fi
    if [ -f "$outfile" ]
    then
        # main output
        cat <<-EOF >> "$outfile"
###############################################################################
Peregrine Cluster
Job $SLURM_JOB_ID for user '$SLURM_JOB_USER'
Finished at: $(date)
Job details:
============
$(jobinfo $SLURM_JOB_ID)
EOF
        #################################
        #                               #
        # We don't use quota            #
        #                               #
        #################################
        # # is 'myquota' enabled?
        # # make sure that myquota will work - the PATH isn't exported by default
        # export PATH
        # myquota -u $SLURM_JOB_USER > /dev/null 2>&1
        # displayQuota=0
        # if [ $displayQuota -ne 0 ]
        # then
        #     cat <<-EOF >> "$outfile"
        # Disk quota details:
        # ===================
        # Name Filesystem Used Quota Limit Grace Files Quota Limit Grace
        # $SLURM_JOB_USER $(lfs quota -h /home | grep -i "home" | head -n 1)
        # $SLURM_JOB_USER $(lfs quota -h /data | grep -i "data" | head -n 1)
        #
        # EOF
        # fi
        # finish output
        cat <<-EOF >> "$outfile"
Acknowledgements:
=================
Please see this page if you want to acknowledge Peregrine in your publications:
https://redmine.hpc.rug.nl/redmine/projects/peregrine/wiki/ScientificOutput
################################################################################
EOF
    fi
    # 20150210 - Sean
    # Save the details of a job by doing an 'scontrol show job=<jobid>',
    # so it can be referenced for troubleshooting in future if needed;
    # should be run by the slurm epilog.
    #/usr/bin/scontrol show job=$SLURM_JOB_ID > $recordsdir/$SLURM_JOBID.record
fi
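
Note: 'scontrol show hostnames' without an argument expands $SLURM_JOB_NODELIST, so the guard at the top runs the body on the first node of the allocation only, e.g.:

    $ SLURM_JOB_NODELIST='pg-node[004-006]' scontrol show hostnames | head -n 1
    pg-node004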

roles/slurm-client/templates/slurm.prolog | 11

@@ -0,0 +1,11 @@
#!/bin/bash
if [ x$SLURM_JOB_ID = "x" ] ; then
    exit 0
fi
#TMPDIR=/local/${SLURM_JOB_ID}
#mkdir -p ${TMPDIR} && chown ${SLURM_JOB_USER}.${SLURM_JOB_GROUP} ${TMPDIR} && chmod 700 ${TMPDIR}
SCRATCHDIR=/scratch/jobs/${SLURM_JOB_ID}
mkdir -p ${SCRATCHDIR} && chown ${SLURM_JOB_USER}.${SLURM_JOB_GROUP} ${SCRATCHDIR} && chmod 700 ${SCRATCHDIR}

roles/slurm-client/templates/slurm.taskprolog | 12

@@ -0,0 +1,12 @@
#!/bin/bash
if [ x$SLURM_JOB_ID = "x" ] ; then
    exit 0
fi
#TMPDIR=/local/${SLURM_JOB_ID}
TMPDIR=/local/tmp
SCRATCHDIR=/scratch/jobs/${SLURM_JOB_ID}
echo "export TMPDIR=${TMPDIR}"
echo "export SCRATCHDIR=${SCRATCHDIR}"
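
Slurm runs the TaskProlog and reads its standard output: lines of the form "export NAME=value" are injected into the task environment, which is why this script echoes the exports instead of setting them. Inside any job step the result can be checked with:

    $ srun bash -c 'echo $TMPDIR $SCRATCHDIR'
    /local/tmp /scratch/jobs/<jobid>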