Browse Source

Merge branch 'master' of ssh://git.web.rug.nl:222/HPC/pg-playbooks

pull/17/head
B.E. Droge 1 year ago
parent
commit
a4cb09cd33
  1. 2
      roles/slurm-management/defaults/main.yml
  2. 2
      roles/slurm-management/files/job_submit.lua
  3. 21
      roles/slurm-management/files/logrotate.d/slurmctld
  4. 6
      roles/slurm-management/files/logrotate.d/slurmdbd
  5. 24
      roles/slurm-management/files/slurm.conf
  6. 16
      roles/slurm-management/tasks/main.yml
  7. 12
      roles/slurm-management/vars/main.yml

2
roles/slurm-management/defaults/main.yml

@@ -1,5 +1,5 @@
---
-slurm_version: 19.05.4
+slurm_version: 20.11.8
slurm_uid: 497
slurm_gid: 501
munge_uid: 498

2
roles/slurm-management/files/job_submit.lua

@@ -58,7 +58,7 @@ function slurm_job_submit(job_desc, part_list, submit_uid)
end
-- Reject GPU jobs that do not request at least one GPU.
-if ((job_desc.partition == "gpu" or job_desc.partition == "gpushort") and not job_desc.gres)) then
+if ((job_desc.partition == "gpu" or job_desc.partition == "gpushort") and not job_desc.gres) then
slurm.log_user("Jobs submitted to the GPU partition need to request at least one GPU, see:\nhttps://wiki.hpc.rug.nl/peregrine/jobs/advanced_topics/running_jobs_on_gpus")
return slurm.ERROR
end

21
roles/slurm-management/files/logrotate.d/slurmctld

@@ -0,0 +1,21 @@
##
# Slurm Logrotate Configuration
##
/var/log/slurm/slurmctld.log {
compress
missingok
nocopytruncate
nodelaycompress
nomail
notifempty
noolddir
rotate 10
sharedscripts
size=5M
create 0600 slurm root
postrotate
pkill -x --signal SIGUSR2 slurmctld
exit 0
endscript
}

6
roles/slurm-management/files/slurm.logrotate → roles/slurm-management/files/logrotate.d/slurmdbd

@@ -1,7 +1,7 @@
##
# Slurm Logrotate Configuration
##
-/var/log/slurm/*.log {
+/var/log/slurm/slurmdbd.log {
compress
missingok
nocopytruncate
@@ -12,10 +12,10 @@
rotate 5
sharedscripts
size=5M
-create 640 slurm root
+create 0600 slurm root
postrotate
-pkill -x --signal SIGUSR2 slurmctld
+pkill -x --signal SIGUSR2 slurmdbd
exit 0
endscript
}

24
roles/slurm-management/files/slurm.conf

@@ -40,7 +40,7 @@ TaskPlugin=task/affinity,task/cgroup
JobSubmitPlugins=lua
#TrackWCKey=no
#TreeWidth=50
-#TmpFS=
+TmpFS=/local/tmp
#UsePAM=
#CheckpointType=checkpoint/blcr
JobCheckpointDir=/var/slurm/checkpoint
@@ -155,15 +155,15 @@ NodeSet=dh Nodes=dh-node[11-13,15-17,19]
# COMPUTE NODES
#
GresTypes=gpu
-NodeName=pg-node[004-162] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Feature=24cores,centos7
-NodeName=pg-gpu[01-04,06] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Gres=gpu:k40:2 Feature=24cores,centos7
-NodeName=pg-gpu[07-11,13-14,16,18-20,23-31,33-42] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Gres=gpu:v100:1 Feature=12cores,centos7,monk
-NodeName=pg-gpu[12,15,17,21,22,32] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Gres=gpu:v100:1 Feature=12cores,centos7,monk,nvme
-NodeName=pg-memory[01-03] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=1031500 Feature=48cores,centos7
-NodeName=pg-memory[04-07] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=2063500 Feature=48cores,centos7
-NodeName=pg-node[163-210] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Feature=28cores,centos7
-NodeName=pg-node[211-225] Sockets=2 CoresPerSocket=32 ThreadsPerCore=1 State=UNKNOWN RealMemory=512000 Feature=64cores,centos7
-NodeName=pg-lab01 Sockets=2 CoresPerSocket=20 ThreadsPerCore=1 State=UNKNOWN RealMemory=1547000 Gres=gpu:rtx8000:2 Feature=centos7,dcv2
+NodeName=pg-node[004-162] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Feature=24cores,centos7
+NodeName=pg-gpu[01-04,06] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Gres=gpu:k40:2 Feature=24cores,centos7
+NodeName=pg-gpu[07-11,13-14,16,18-20,23-31,33-42] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=4750 Gres=gpu:v100:1 Feature=12cores,centos7,monk
+NodeName=pg-gpu[12,15,17,21,22,32] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=1048000 Gres=gpu:v100:1 Feature=12cores,centos7,monk,nvme
+NodeName=pg-memory[01-03] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=1031500 TmpDisk=860500 Feature=48cores,centos7
+NodeName=pg-memory[04-07] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=2063500 TmpDisk=860500 Feature=48cores,centos7
+NodeName=pg-node[163-210] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Feature=28cores,centos7
+NodeName=pg-node[211-225] Sockets=2 CoresPerSocket=32 ThreadsPerCore=1 State=UNKNOWN RealMemory=512000 TmpDisk=15140200 Feature=64cores,centos7
+NodeName=pg-lab01 Sockets=2 CoresPerSocket=20 ThreadsPerCore=1 State=UNKNOWN RealMemory=1547000 TmpDisk=7568500 Gres=gpu:rtx8000:2 Feature=centos7,dcv2
NodeName=pg-node[230-240] State=FUTURE
-NodeName=pg-node[242-276] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Feature=12cores,centos7,monk
-NodeName=dh-node[11-13,15-17,19] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Feature=28cores,centos7
+NodeName=pg-node[242-276] Sockets=1 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=4750 Feature=12cores,centos7,monk
+NodeName=dh-node[11-13,15-17,19] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 TmpDisk=860500 Feature=28cores,centos7

16
roles/slurm-management/tasks/main.yml

@@ -42,12 +42,10 @@
become: true
- name: Install yum dependencies.
-yum: name={{ item }} state=latest update_cache=yes
-with_items:
-- munge
-- ntp
-- mailx
-- ssmtp
+yum:
+name: ["munge", "ntp", "mailx", "ssmtp"]
+state: latest
+update_cache: yes
become: true
- name: Install munge.key file.
@@ -148,9 +146,9 @@
template:
src: 'files/slurmdbd.conf'
dest: '/etc/slurm/slurmdbd.conf'
-owner: 'root'
-group: 'root'
-mode: '0644'
+owner: 'slurm'
+group: 'slurm'
+mode: '0600'
notify: reload_slurmdbd
become: true

12
roles/slurm-management/vars/main.yml

@@ -1,7 +1,7 @@
$ANSIBLE_VAULT;1.1;AES256
-31623737333935623739376631366131393038663161396361303463653639633430393037366335
-6230313137353933323231613232366261666530393365610a636332616438386534663766343736
-65626462303965653433646662666139343161656639643739643530363630376539323133396630
-3737386464373064380a643964653433383239366639366330323539376631633738633531623235
-30613534346361326265623663356637316266313331663537366136323162393230323562373537
-6665353631383137323465633230613537323733396434633163
+66343433616139623435393766316631393933626361626262373431646363663165366638376365
+3662383265623466393065333266623165666236663637390a643335646565343430633261663338
+39386337613434383532396462326461326566653835636265386432653133653363393866306334
+3931393935306262390a396139386464383264366664643832306365303864353039353334313039
+64313739393466363435343862633430333433366133323066323139663138663362303062633632
+6436316365336432386562393033616131313031386639343730

Loading…
Cancel
Save