Compare commits


1 Commit

Author: Egon Rijpkema
SHA1: a9d1f4e5bd
Message: No 72h predictions on /var.
Date: 8 months ago
  1. hosts (8)
  2. roles/inefficient_jobs_detector/files/find_inefficient_jobs (15)
  3. roles/ldap/files/login_checks.sh (266)
  4. roles/prom_server/tasks/main.yml (1)
  5. roles/prom_server/templates/etc/alerting.rules (82)
  6. roles/prom_server/templates/etc/pg-lustre.json (16)
  7. roles/prom_server/templates/etc/targets.json (40)
  8. roles/slurm-management/files/slurm.conf (18)
  9. roles/slurm-management/files/slurmdbd.conf (2)

hosts (8)

@@ -9,7 +9,7 @@ pg-node002
pg-node003
[compute_node]
pg-node[004:240]
pg-node[004:276]
dh-node11
dh-node12
dh-node13
@@ -41,6 +41,9 @@ pg-node[101:150]
[compute151-210]
pg-node[151:210]
[compute242-276]
pg-node[242:276]
[memory_node]
pg-memory[01:07]
@@ -51,6 +54,9 @@ pg-gpu[01:42]
[vgpu_node]
pg-gpu[07:42]
[vcpu_node]
pg-node[270:276]
[vult_node]
pg-node[230:234]

roles/inefficient_jobs_detector/files/find_inefficient_jobs (15)

@@ -91,15 +91,12 @@ def get_job_stats(start_time, end_time):
# Remove trailing G from MaxRSS
values[FIELDS.index('maxrss')] = values[FIELDS.index('maxrss')].split('G')[0] if values[FIELDS.index('maxrss')] else 0
# ReqMem can be specified as Gn (GB per node) or Gc (GB per core); calculate the total and remove the suffix.
#if values[FIELDS.index('reqmem')][-1] == 'n':
# values[FIELDS.index('reqmem')] = values[FIELDS.index('reqmem')][:-2]
#elif values[FIELDS.index('reqmem')][-1] == 'c':
# # Multiply by the (average) number of CPUs per node. The real number of cores per node cannot be obtained by sacct.
# cpn = int(values[FIELDS.index('ncpus')]) / int(values[FIELDS.index('nnodes')])
# values[FIELDS.index('reqmem')] = float(values[FIELDS.index('reqmem')][:-2]) * cpn
#
# ReqMem is specified per job in newer Slurm versions, so only remove the trailing G
values[FIELDS.index('reqmem')] = values[FIELDS.index('reqmem')].split('G')[0] if values[FIELDS.index('reqmem')] else 0
if values[FIELDS.index('reqmem')][-1] == 'n':
values[FIELDS.index('reqmem')] = values[FIELDS.index('reqmem')][:-2]
elif values[FIELDS.index('reqmem')][-1] == 'c':
# Multiply by the (average) number of CPUs per node. The real number of cores per node cannot be obtained by sacct.
cpn = int(values[FIELDS.index('ncpus')]) / int(values[FIELDS.index('nnodes')])
values[FIELDS.index('reqmem')] = float(values[FIELDS.index('reqmem')][:-2]) * cpn
# Convert all values to the specified type.
values = [t(v) for t,v in zip(FIELD_TYPES, values)]
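The comments above describe how sacct reports ReqMem either per node (trailing 'Gn') or per core (trailing 'Gc'), while newer Slurm versions report it per job with only a trailing 'G'. A minimal standalone sketch of that normalisation, mirroring the script's logic (parse_reqmem is a hypothetical helper, not part of find_inefficient_jobs):

def parse_reqmem(reqmem, ncpus, nnodes):
    # Hypothetical helper mirroring the suffix handling above.
    if not reqmem:
        return 0.0
    if reqmem.endswith('n'):
        # GB per node, e.g. '4Gn': drop the 'Gn' suffix.
        return float(reqmem[:-2])
    if reqmem.endswith('c'):
        # GB per core, e.g. '2Gc': scale by the (average) number of CPUs per node,
        # since sacct cannot report the real cores-per-node figure.
        cpn = ncpus / nnodes
        return float(reqmem[:-2]) * cpn
    # Newer Slurm: ReqMem is per job, so only the trailing 'G' is removed.
    return float(reqmem.split('G')[0])

print(parse_reqmem('2Gc', 24, 2))   # 24.0
print(parse_reqmem('4Gn', 24, 2))   # 4.0
print(parse_reqmem('16G', 24, 2))   # 16.0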

roles/ldap/files/login_checks.sh (266)

@@ -1,198 +1,100 @@
#!/bin/bash
VARDIR=/var/lib/pam_script
VARLOG=$VARDIR/$PAM_USER
MOUNTPOINT1=/data
USERDIR1=$MOUNTPOINT1/$PAM_USER
MOUNTPOINT2=/scratch
USERDIR2=$MOUNTPOINT2/$PAM_USER
SACCTMGR=/usr/bin/sacctmgr
LFS=/usr/bin/lfs
AWK=/bin/awk
GREP=/bin/grep
LOGFILE=/tmp/log.$PAM_USER
GROUP=$( /usr/bin/id -g $PAM_USER )
SLURMACCOUNT=users,vulture
SSHDIR=$( eval /bin/echo ~$PAM_USER )/.ssh
set -u
#
##
### Variables.
##
#
SLURM_ACCOUNT='users'
# Set a tag for the log entries.
LOGGER='logger --tag login_checks'
#
##
### Functions.
##
#
#
# Usage: run_with_timeout N cmd args...
# or: run_with_timeout cmd args...
# In the second case, cmd cannot be a number and the timeout will be 10 seconds.
run_with_timeout () {
#
run_with_timeout () {
local time=10
if [[ $1 =~ ^[0-9]+$ ]]; then time=$1; shift; fi
# Run in a subshell to avoid job control messages
#
# Run in a subshell to avoid job control messages.
#
( "$@" &
child=$!
# Avoid default notification in non-interactive shell for SIGTERM
trap -- "" SIGTERM
( sleep $time
kill $child 2> /dev/null ) &
wait $child
child=$!
#
# Avoid default notification in non-interactive shell for SIGTERM.
#
trap -- "" SIGTERM
( sleep $time
kill $child 2> /dev/null
) &
wait $child
)
}
create_dir () {
if [ $# -ne 2 ]; then
echo "ERROR: create_dir expects both mountpoint and directory as arguments"
exit -1
fi
echo "Checking for $2"
# Check if MOUNTPOINT is a mountpoint
if ! mountpoint -q $1; then
echo "ERROR: Exiting $1 is not mounted"
exit -1
fi
# check if directory exists in MOUNTPOINT
if [ -d "$2" ]; then
echo Directory exists, skipping create
else
echo "Creating directory"
mkdir $2
chown $PAM_USER:$GROUP $2
chmod 700 $2
fi
# check if directory exists now
if [ -d "$2" ]; then
echo Directory exists, OK
else
echo "ERROR: Directory $2 should exist but doesn't"
exit -1
fi
}
create_ssh_key() {
echo "Checking for .ssh in $SSHDIR"
if [ ! -e $SSHDIR ]; then
echo "Creating $SSHDIR"
mkdir $SSHDIR
chmod 700 $SSHDIR
chown $PAM_USER:$GROUP $SSHDIR
else
echo ".ssh directory exists already, continuing"
fi
if [ ! -e $SSHDIR/id_rsa ]; then
echo "Creating key pair"
ssh-keygen -t rsa -N "" -f $SSHDIR/id_rsa
chmod 600 $SSHDIR/id_rsa
chown $PAM_USER:$GROUP $SSHDIR/id_rsa
chown $PAM_USER:$GROUP $SSHDIR/id_rsa.pub
echo "Adding key pair to authorized_keys"
if [ ! -e $SSHDIR/authorized_keys ]; then
cp $SSHDIR/id_rsa.pub $SSHDIR/authorized_keys
chmod 600 $SSHDIR/authorized_keys
chown $PAM_USER:$GROUP $SSHDIR/authorized_keys
else
cat $SSHDIR/id_rsa.pub >> $SSHDIR/authorized_keys
fi
else
echo "Key exists, checking for authorized_keys"
if [ ! -e $SSHDIR/authorized_keys ]; then
cp $SSHDIR/id_rsa.pub $SSHDIR/authorized_keys
chmod 600 $SSHDIR/authorized_keys
chown $PAM_USER:$GROUP $SSHDIR/authorized_keys
else
echo "authorized_keys exists, doing nothing"
fi
fi
echo "Final check for authorized_keys, to see if we are OK"
if [ ! -e $SSHDIR/authorized_keys ]; then
echo "ERROR: authorized_keys has not been generated"
exit -1
fi
}
set_quota () {
if [ $# -ne 5 ]; then
echo "ERROR: set_quota expects 4 values for quota and a file system name"
exit -1
fi
if [ "$PAM_USER" == "root" ]; then
return 0
fi
echo "Checking for existing quota in $5"
quota_user=$( $LFS quota -u $PAM_USER $5 | $GREP $5 | $AWK '{print $3}' )
quota_group=$( $LFS quota -g $GROUP $5 | $GREP $5 | $AWK '{print $3}' )
# Check if quota obtained are real numbers
if ! [[ $quota_user =~ ^-?[0-9]+$ && $quota_group =~ ^-?[0-9]+$ ]]; then
echo "ERROR: Strange quota"
exit -1
fi
# Add the quota for user and group, to check if either is set
# Quota user must be 0 for all users in the current situation.
quota=$(($quota_user + $quota_group))
# regexp for checking if quota are a number
echo Quota: $quota
# If quota are not set or a small value (default quota) they must be set
if [ $quota -le "4096" ]; then
echo "Setting quota for $5"
$LFS setquota -g $GROUP --block-softlimit $1 --block-hardlimit $2 --inode-softlimit $3 --inode-hardlimit $4 $5
if [ $? -ne 0 ]; then
echo "ERROR: Problem setting quota"
exit -1
fi
else
echo "FD: Quota already set, doing nothing"
fi
}
add_user_to_slurm() {
echo "Adding account to SLURM db"
user_exists=$( $SACCTMGR show user $PAM_USER | grep $PAM_USER )
if [ -z "$user_exists" ]; then
$SACCTMGR -i create user name=$PAM_USER account=$SLURMACCOUNT fairshare=1
if [ $? -ne 0 ]; then
echo "ERROR: Problem creating user in accounting database"
exit -1
login_actions () {
#
# Check if login user exists as SLURM user in the SLURM accounting DB.
#
if [ "$(sacctmgr -p list user "${PAM_USER}" format=User | grep -o "${PAM_USER}")" == "${PAM_USER}" ]; then
if [ "${PAM_USER}" != 'root' ]; then
# Only log for users other than root to prevent flooding the logs...
$LOGGER "User ${PAM_USER} already exists in SLURM DB."
fi
else
echo User already exists in slurm. OK.
fi
else
#
# Create account in SLURM accounting DB.
#
local _log_message="Creating user ${PAM_USER} in SLURM accounting DB..."
local _status="$(sacctmgr -iv create user name=${PAM_USER} account=${SLURM_ACCOUNT} fairshare=1 2>&1)"
#
# Checking for exit status does not work when executed by pam-script :(
# Therefore we explicitly re-check if the user now exists in the SLURM DB...
#
#if [ $? -eq 0 ]; then
if [ "$(sacctmgr -p list user "${PAM_USER}" format=User | grep -o "${PAM_USER}")" == "${PAM_USER}" ]; then
_log_message="${_log_message}"' done!'
else
_log_message="${_log_message}"' FAILED. You cannot submit jobs. Contact an admin!'
$LOGGER "${_status}"
fi
$LOGGER -s "${_log_message}"
fi
}
login_actions () {
echo "Checking if $PAM_USER has been handled already"
if [ -f "$VARLOG" ]; then
echo "User already known, exiting"
exit 0
fi
create_dir $MOUNTPOINT1 $USERDIR1
create_dir $MOUNTPOINT2 $USERDIR2
create_ssh_key
# Create account in SLURM accounting db
add_user_to_slurm
# set lustre-quota:
set_quota 20G 22G 100k 110k /home
set_quota 250G 275G 1000k 1100k /data
set_quota 10T 20T 5000k 5500k /scratch
# Final action: create file with username in /var directory
echo $( /usr/bin/getent passwd $PAM_USER | /bin/awk -F ':' '{print $5}' ) > $VARLOG
echo "Finished actions successfully"
}
# Log start of script
echo "Script starting" > $LOGFILE
# Run the desired actions with a timeout of 10 seconds
run_with_timeout 10 login_actions >> $LOGFILE
echo "Script finished" >> $LOGFILE
#
##
### Main.
##
#
#
# Make sure we execute this file only for interactive sessions with a real shell.
# Hence not for SFTP connections,
# which will terminate instantly when anything that is not a valid FTP command is printed on STDOUT or STDERR.
# For SFTP connections as well as SLURM jobs the TERM type is dumb,
# but in the first case there are no SLURM related environment variables defined.
#
# SOURCE_HPC_ENV variable checking disabled (it is not set ) Egon 30-10-2018
#if [ ${TERM} == 'dumb' ] && [ -z ${SOURCE_HPC_ENV} ]; then
if [ ${TERM} == 'dumb' ]; then
$LOGGER "debug: exiting because of dumb terminal"
exit 0
fi
#
# Run the desired login actions with a timeout of 10 seconds.
#
run_with_timeout 10 login_actions
exit 0
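The rewritten login_actions above cannot rely on sacctmgr's exit status when run from pam-script, so it re-checks whether the user exists in the SLURM accounting DB after the create call. A rough Python rendering of that existence check (illustration only; it assumes sacctmgr is on the PATH and uses the same parsable listing as the script):

import subprocess

def slurm_user_exists(user):
    # List the user in parsable mode and look for an exact match, like
    # `sacctmgr -p list user "$PAM_USER" format=User | grep -o "$PAM_USER"`.
    out = subprocess.run(
        ['sacctmgr', '-p', 'list', 'user', user, 'format=User'],
        capture_output=True, text=True).stdout
    return any(line.split('|')[0] == user for line in out.splitlines())

Checking the listing output rather than the exit code matches the workaround documented in the script's comments.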

roles/prom_server/tasks/main.yml (1)

@@ -18,7 +18,6 @@
- alerting.rules
- datahandling.json
- gpu_nodes.json
- pg-lustre.json
- prometheus.yml
- targets.json

roles/prom_server/templates/etc/alerting.rules (82)

@@ -1,7 +1,7 @@
groups:
- name: basic
rules:
- alert: InstanceDownFromKnyft
- alert: InstanceDown
expr: up{job="node"} == 0
for: 10m
labels:
@@ -26,39 +26,70 @@ groups:
annotations:
description: '{{ $labels.instance }} has a clock offset > 1 second.'
summary: '{{ $labels.instance }} has clock drift.'
- alert: high iowait
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[10m])) * 100) > 98
for: 10m
- alert: DiskWillFillIn8Hours
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[2h], 8 * 3600) < 0
for: 2h
labels:
severity: page
annotations:
description: '{{ $labels.instance }} has high iowait.'
summary: '{{ $labels.instance }} has high iowait.'
- alert: extreme load
expr: node_load1 > 300
for: 5m
description: Instance {{ $labels.instance }} will fill up within 8 hours
summary: '{{ $labels.instance }} disk full'
- alert: DiskWillFillIn72Hours
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/var|/target/gpfs3",fstype!~"fuse?"}[6h], 72 * 3600) < 0
for: 8h
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} will fill up within 72 hours
summary: '{{ $labels.instance }} disk almost full'
- alert: DiskFull
expr: node_filesystem_free{job="node",mountpoint!~"/net|/cvmfs.*|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse"} < 5.24288e+06
for: 15m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
summary: '{{ $labels.instance }} Disk full'
- alert: tmpFull
expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
for: 30m
labels:
severity: page
annotations:
description: '{{ $labels.instance }} high load.'
summary: '{{ $labels.instance }} load > 300'
description: Instance {{ $labels.instance }} Has a full /tmp
summary: '{{ $labels.instance }} /tmp full'
- alert: DiskWillFillIn8Hours
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse.*"}[2h], 8 * 3600) < 0
expr: predict_linear(node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[2h], 8 * 3600) < 0
for: 2h
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} will fill up within 8 hours
summary: '{{ $labels.instance }} disk full'
- alert: DiskWillFillIn72Hours
expr: predict_linear(node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/local|/target/gpfs3",fstype!~"fuse?"}[6h], 72 * 3600) < 0
for: 8h
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} will fill up within 72 hours
summary: '{{ $labels.instance }} disk almost full'
- alert: DiskFull
# Less than 30 MiB free on filesystems.
expr: node_filesystem_free_bytes{job="node",mountpoint!~"/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.*"} < 3.14573e+07
expr: node_filesystem_free_bytes{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse?"} < 5.24288e+06
for: 5m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
summary: '{{ $labels.instance }} {{ $labels.mountpoint }} Disk full'
summary: '{{ $labels.instance }} Disk full'
- alert: tmpFull
expr: node_filesystem_free_bytes{job="node",mountpoint="/tmp"} < 5242880
for: 30m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} Has a full /tmp
summary: '{{ $labels.instance }} /tmp full'
- alert: NodeRebooted
expr: delta(node_boot_time[1h]) > 10
for: 1m
@@ -67,22 +98,6 @@ groups:
annotations:
description: Instance {{ $labels.instance }} has been rebooted.
summary: '{{ $labels.instance }} rebooted'
- alert: LustreHealthNotOK
expr: lustre_health_check != 1
for: 1m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} lustre down!.
summary: '{{ $labels.instance }} lustre down'
# - alert: TestLustreHealthIsOK
# expr: lustre_health_check == 1
# for: 1m
# labels:
# severity: page
# annotations:
# description: Instance {{ $labels.instance }} lustre down!.
# summary: '{{ $labels.instance }} lustre down'
- name: gpu
rules:
- alert: UncorrectedECC
@@ -154,7 +169,7 @@ groups:
annotations:
description: '{{ $labels.instance }}: Has high load.'
summary: '{{ $labels.instance }} has high load.'
- alert: oss01_high_load
- alert: LoadOSS
expr: 100*(1-avg(rate(node_cpu_seconds_total{instance="pg-oss01.hpc.local:9100",job="node",mode="idle"}[5m]))) > 75
for: 10m
labels:
@@ -162,7 +177,7 @@ groups:
annotations:
description: '{{ $labels.instance }}: Has high load.'
summary: '{{ $labels.instance }} has high load.'
- alert: oss02_high_load
- alert: LoadOSS
expr: 100*(1-avg(rate(node_cpu_seconds_total{instance="pg-oss02.hpc.local:9100",job="node",mode="idle"}[5m]))) > 75
for: 10m
labels:
@@ -226,6 +241,7 @@ groups:
annotations:
description: Merlin Ceph is in Warn state longer than 30m, please check status of pools and OSDs
summary: CEPH in WARN
#
# - alert: Node drained.
# expr: min_over_time(slurm_nodes_drain{job="slurm_exorter"}[2h]) < slurm_nodes_drain{job="slurm_exorter"}
# for: 1h
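The DiskWillFillIn8Hours and DiskWillFillIn72Hours alerts use PromQL's predict_linear(), which fits a least-squares line to the samples in the lookback window and extrapolates it the given number of seconds ahead; per the commit message, the 72-hour rule now also excludes /var from its mountpoint regex. A small numeric sketch of the idea (NumPy approximation of the semantics, not the actual Prometheus implementation):

import numpy as np

# Free bytes sampled hourly over a 6 h window, shrinking by ~0.72 GB per hour.
t = np.arange(0, 7) * 3600.0          # seconds
free = 50e9 - t * 2e5

# Fit free ~= a*t + b and extrapolate 72 hours ahead, roughly what
# predict_linear(node_filesystem_free_bytes[6h], 72 * 3600) computes.
a, b = np.polyfit(t, free, 1)
predicted = a * (t[-1] + 72 * 3600) + b
print(predicted)   # negative -> the 72 h alert condition (< 0) would hold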

roles/prom_server/templates/etc/pg-lustre.json (16)

@@ -1,16 +0,0 @@
[
{
"targets": [
"pg-mds01.hpc.local:9100",
"pg-mds02.hpc.local:9100",
"pg-oss01.hpc.local:9100",
"pg-oss02.hpc.local:9100",
"pg-oss03.hpc.local:9100",
"pg-oss04.hpc.local:9100"
],
"labels": {
"env": "peregrine",
"job": "node"
}
}
]

roles/prom_server/templates/etc/targets.json (40)

@@ -6,6 +6,7 @@
"pg-gpu03:9100",
"pg-gpu04:9100",
"pg-gpu05:9100",
"pg-gpu06:9100",
"pg-gpu07:9100",
"pg-gpu08:9100",
"pg-gpu09:9100",
@@ -253,8 +254,45 @@
"pg-node205:9100",
"pg-node206:9100",
"pg-node207:9100",
"pg-node208:9100",
"pg-node209:9100",
"pg-node210:9100"
"pg-node210:9100",
"pg-node242:9100",
"pg-node243:9100",
"pg-node244:9100",
"pg-node245:9100",
"pg-node246:9100",
"pg-node247:9100",
"pg-node248:9100",
"pg-node249:9100",
"pg-node250:9100",
"pg-node251:9100",
"pg-node252:9100",
"pg-node253:9100",
"pg-node254:9100",
"pg-node255:9100",
"pg-node256:9100",
"pg-node257:9100",
"pg-node258:9100",
"pg-node259:9100",
"pg-node260:9100",
"pg-node260:9100",
"pg-node261:9100",
"pg-node262:9100",
"pg-node263:9100",
"pg-node264:9100",
"pg-node265:9100",
"pg-node266:9100",
"pg-node267:9100",
"pg-node268:9100",
"pg-node269:9100",
"pg-node270:9100",
"pg-node271:9100",
"pg-node272:9100",
"pg-node273:9100",
"pg-node274:9100",
"pg-node275:9100",
"pg-node276:9100"
],
"labels": {
"env": "peregrine",

roles/slurm-management/files/slurm.conf (18)

@@ -20,6 +20,7 @@ SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
@@ -42,6 +43,7 @@ JobSubmitPlugins=lua
TmpFS=/local/tmp
#UsePAM=
#CheckpointType=checkpoint/blcr
JobCheckpointDir=/var/slurm/checkpoint
#
# Terminate job immediately when one of the processes is crashed or aborted.
KillOnBadExit=1
@@ -60,10 +62,12 @@ Waittime=30
#
# SCHEDULING
SchedulerType=sched/backfill
SchedulerPort=7321
SchedulerParameters=bf_max_job_assoc=50,bf_max_job_test=10000,partition_job_depth=100,bf_window=15000,bf_resolution=600,bf_continue,bf_busy_nodes,bf_job_part_count_reserve=10,bf_interval=60,sched_min_interval=1000000,pack_serial_at_end,max_rpc_cnt=150
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory
#SchedulerAuth=
#SchedulerRootFilter=
PriorityType=priority/multifactor
PriorityFlags=MAX_TRES
PriorityDecayHalfLife=4-0
@@ -131,20 +135,21 @@ PartitionName=himem Nodes=himem MaxTime=10-00:00:00 DefaultTim
PartitionName=regular Nodes=regular MaxTime=10-00:00:00 DefaultTime=00:30:00 AllowAccounts=users AllowQOS=regular,regularmedium,regularlong SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1875G" Default=YES
PartitionName=gelifes Nodes=gelifes MaxTime=10-00:00:00 DefaultTime=00:30:00 AllowAccounts=gelifes AllowQOS=gelifes,gelifesmedium,gelifeslong SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.125G"
PartitionName=lab Nodes=pg-lab01 MaxTime=24:00:00 DefaultTime=04:00:00 AllowAccounts=users AllowGroups=pg-lab SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=0.0,Mem=0.0"
PartitionName=vulture Nodes=merlin,dh,regular_no_ib MaxNodes=1 MaxTime=12:00:00 DefaultTime=00:30:00 AllowAccounts=vulture AllowQOS=vulture SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1G"
PartitionName=vulture Nodes=merlin,rugcloud,dh,regular_no_ib MaxNodes=1 MaxTime=12:00:00 DefaultTime=00:30:00 AllowAccounts=vulture AllowQOS=vulture SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1G"
#
# Node sets
#
NodeSet=short Nodes=pg-node[004-005]
NodeSet=regular Nodes=pg-node[006-162,164,166-167,170-175,177,179-184,186,188-193,196,198-205,209-209]
NodeSet=regular_no_ib Nodes=pg-node[163,165,168,169,176,178,185,187,194,195,197,206,207,210]
NodeSet=regular Nodes=pg-node[006-162,164,166-167,169-177,179-186,188-193,196,198-205,207-209]
NodeSet=regular_no_ib Nodes=pg-node[163,165,168,178,187,194,195,197,206,210]
NodeSet=k40 Nodes=pg-gpu[01-04]
NodeSet=v100_short Nodes=pg-gpu[07-08]
NodeSet=v100 Nodes=pg-gpu[09-42]
NodeSet=gelifes Nodes=pg-node[211-225]
NodeSet=himem Nodes=pg-memory[01-07]
NodeSet=merlin Nodes=pg-node[230-240]
NodeSet=rugcloud Nodes=pg-node[242-276]
NodeSet=dh Nodes=dh-node[12-13,15-17]
#
@@ -153,12 +158,13 @@ NodeSet=dh Nodes=dh-node[12-13,15-17]
GresTypes=gpu
NodeName=pg-node[004-162] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Feature=24cores,centos7
NodeName=pg-gpu[01-04,06] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Gres=gpu:k40:2 Feature=24cores,centos7
NodeName=pg-gpu[07-10,13,16,19,20,27,30,31,33-37,39-42] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=4750 Gres=gpu:v100:1 Feature=12cores,centos7,monk
NodeName=pg-gpu[11,12,14,15,17,18,21-26,28,29,32,38] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=1048000 Gres=gpu:v100:1 Feature=12cores,centos7,monk,nvme
NodeName=pg-gpu[07-11,13-14,16,18-20,23-31,33-42] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=4750 Gres=gpu:v100:1 Feature=12cores,centos7,monk
NodeName=pg-gpu[12,15,17,21,22,32] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=1048000 Gres=gpu:v100:1 Feature=12cores,centos7,monk,nvme
NodeName=pg-memory[01-03] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=1031500 TmpDisk=860500 Feature=48cores,centos7
NodeName=pg-memory[04-07] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=2063500 TmpDisk=860500 Feature=48cores,centos7
NodeName=pg-node[163-210] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Feature=28cores,centos7
NodeName=pg-node[211-225] Sockets=2 CoresPerSocket=32 ThreadsPerCore=1 State=UNKNOWN RealMemory=512000 TmpDisk=15140200 Feature=64cores,centos7
NodeName=pg-lab01 Sockets=2 CoresPerSocket=20 ThreadsPerCore=1 State=UNKNOWN RealMemory=1547000 TmpDisk=7568500 Gres=gpu:rtx8000:2 Feature=centos7,dcv2
NodeName=pg-node[230-240] State=FUTURE
NodeName=pg-node[230-240,247,269] State=FUTURE
NodeName=pg-node[242-246,248-268,270-276] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=4750 Feature=12cores,centos7,monk
NodeName=dh-node[12-13,15-17] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Feature=28cores,centos7

roles/slurm-management/files/slurmdbd.conf (2)

@@ -10,7 +10,7 @@ DbdHost=knyft.hpc.rug.nl
DebugLevel=info #was: 4
# Temporarily increased to find cause of crashes
#DebugLevel=debug5
PrivateData=usage
PrivateData=usage,users
PurgeEventAfter=2month
PurgeJobAfter=12months
PurgeResvAfter=1month
