Would like to move to https://github.com/rug-cit-hpc/pg-playbooks but has large files...

ClusterName=Peregrine
ControlMachine=knyft.hpc.rug.nl
ControlAddr=knyft.hpc.rug.nl
#BackupController=
#BackupAddr=
#
SlurmUser=root
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm
SlurmdSpoolDir=/var/spool/slurmd
SwitchType=switch/none
MpiDefault=pmi2
MpiParams=ports=12000-12999
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
Prolog=/etc/slurm/slurm.prolog
PrologFlags=Alloc
Epilog=/etc/slurm/slurm.epilog*
#SrunProlog=
#SrunEpilog=
TaskProlog=/etc/slurm/slurm.taskprolog
#TaskEpilog=/etc/slurm/slurm.taskepilog
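# The prolog/epilog scripts above are site-specific and not shown in this
# file; PrologFlags=Alloc runs the prolog when resources are allocated
# rather than at first task launch. A minimal sketch of what such a
# prolog typically does (illustrative only, paths assumed):
#   #!/bin/bash
#   mkdir -p "/local/tmp/${SLURM_JOB_ID}"      # per-job scratch dir
#   chown "${SLURM_JOB_USER}" "/local/tmp/${SLURM_JOB_ID}"
#   exit 0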
#TaskPlugin=affinity
TaskPlugin=task/cgroup
JobSubmitPlugins=lua
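# JobSubmitPlugins=lua makes slurmctld load job_submit.lua from the same
# directory as this file. A minimal skeleton (illustrative only; the real
# site script is not part of this file):
#   function slurm_job_submit(job_desc, part_list, submit_uid)
#       -- e.g. route jobs to a partition or reject bad requests here
#       return slurm.SUCCESS
#   end
#   function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
#       return slurm.SUCCESS
#   end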
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#CheckpointType=checkpoint/blcr
JobCheckpointDir=/var/slurm/checkpoint
#
# Terminate the job immediately when one of its processes crashes or aborts.
KillOnBadExit=1
# Do not automatically requeue jobs after a node failure
JobRequeue=0
# Cgroups already enforce resource limits; SLURM should not do this as well
MemLimitEnforce=no
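# proctrack/cgroup, task/cgroup and jobacct_gather/cgroup all read
# /etc/slurm/cgroup.conf; a typical minimal version for this setup would
# be (assumed, the actual site file is not shown here):
#   CgroupAutomount=yes
#   ConstrainCores=yes
#   ConstrainRAMSpace=yes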
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=43200
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=1
#
# SCHEDULING
SchedulerType=sched/backfill
SchedulerPort=7321
SchedulerParameters=bf_max_job_user=200,bf_max_job_test=10000,default_queue_depth=500,bf_window=14400,bf_resolution=300,kill_invalid_depend,bf_continue,bf_min_age_reserve=3600
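# bf_window=14400 minutes equals the 10-day MaxTime of the longest
# partitions, so backfill can plan around every running job;
# bf_resolution=300 seconds trades planning precision for scheduler speed,
# and kill_invalid_depend removes jobs whose dependencies can never be met.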
SelectType=select/cons_res
# 13jan2016: disabled CR_ONE_TASK_PER_CORE (HT off) and CR_ALLOCATE_FULL_SOCKET (deprecated)
SelectTypeParameters=CR_Core_Memory
#SchedulerAuth=
#SchedulerRootFilter=
FastSchedule=1
PriorityType=priority/multifactor
PriorityFlags=MAX_TRES
PriorityDecayHalfLife=7-0
PriorityFavorSmall=NO
# Not necessary if there is a decay
#PriorityUsageResetPeriod=14-0
PriorityWeightAge=5000
PriorityWeightFairshare=100000
PriorityWeightJobSize=0
PriorityWeightPartition=0
PriorityWeightQOS=0
PriorityMaxAge=100-0
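# With these weights the multifactor priority reduces to roughly
#   priority = 100000 * fairshare_factor + 5000 * age_factor
# (the other weight terms are zero), so fairshare dominates and queue age
# contributes at most 5% of the fairshare range; the age factor saturates
# once a job has waited PriorityMaxAge=100 days.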
#
# Reservations
ResvOverRun=UNLIMITED
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurm/slurm.jobcomp
#
# ACCOUNTING
#AcctGatherEnergyType=acct_gather_energy/rapl
#JobAcctGatherFrequency=energy=30
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherParams=UsePss,NoOverMemoryKill
JobAcctGatherType=jobacct_gather/cgroup
# Users have to be in the accounting database
# (otherwise we don't have accounting records and fairshare)
#AccountingStorageEnforce=associations
AccountingStorageEnforce=limits,qos # will also enable: associations
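# With associations enforced, jobs are rejected unless the submitting user
# exists in the slurmdbd database. An example of creating such an
# association (account and user names here are placeholders):
#   sacctmgr add account researchgroup
#   sacctmgr add user name=p123456 account=researchgroup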
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=knyft.hpc.rug.nl
#AccountingStorageLoc=/var/log/slurm/slurm.accounting
#AccountingStoragePass=
#AccountingStorageUser=
MaxJobCount=100000
#
# Job profiling
#
#AcctGatherProfileType=acct_gather_profile/hdf5
#JobAcctGatherFrequency=30
#
# Health Check
#
HealthCheckProgram=/usr/sbin/nhc
HealthCheckInterval=300
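# slurmd runs /usr/sbin/nhc (LBNL Node Health Check) every 300 seconds and
# drains the node when a check fails. Checks are defined in NHC's own
# nhc.conf, e.g. (illustrative line, not taken from this repo):
#   * || check_fs_mount_rw /var/spool/slurmd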
#
# Partitions
#
EnforcePartLimits=YES
PartitionName=DEFAULT State=UP DefMemPerCPU=2000
PartitionName=short Nodes=pg-node[004-210] MaxTime=00:30:00 DefaultTime=00:30:00 AllowQOS=short SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1875G" Priority=1
PartitionName=gpu Nodes=pg-gpu[01-06] MaxTime=3-00:00:00 DefaultTime=00:30:00 AllowQOS=gpu,gpulong SelectTypeParameters=CR_Socket_Memory TRESBillingWeights="CPU=1.0,Mem=0.1875G"
PartitionName=himem Nodes=pg-memory[01-07] MaxTime=10-00:00:00 DefaultTime=00:30:00 AllowQOS=himem,himemlong SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.0234375G"
PartitionName=target Nodes=pg-node[100-103] MaxTime=3-00:00:00 DefaultTime=00:30:00 AllowGroups=pg-gpfs,monk AllowQOS=target SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1875G" Priority=2
#PartitionName=euclid Nodes=pg-node[161,162] MaxTime=10-00:00:00 DefaultTime=00:30:00 AllowGroups=beheer,f111959,f111867,p251204,f113751 AllowQOS=target SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1875G"
PartitionName=regular Nodes=pg-node[004-099,104-210] MaxTime=10-00:00:00 DefaultTime=00:30:00 AllowQOS=regular,regularlong SelectTypeParameters=CR_Core_Memory TRESBillingWeights="CPU=1.0,Mem=0.1875G" Default=YES
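# The TRESBillingWeights appear tuned so that using all of a node's memory
# bills about the same as using all of its cores: a 24-core node has
# 128500 MB ~= 125.5 GB, and 125.5 * 0.1875 ~= 23.5 billing units ~= 24
# cores; on the largest himem nodes, 2015 GB * 0.0234375 ~= 47 ~= 48 cores.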
#
# COMPUTE NODES
#
GresTypes=gpu
NodeName=pg-node[004-162] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Feature=24cores,centos7
NodeName=pg-gpu[01-06] Sockets=2 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Gres=gpu:k40:2 Feature=24cores,centos7
NodeName=pg-memory[01-03] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=1031500 Feature=48cores,centos7
NodeName=pg-memory[04-07] Sockets=4 CoresPerSocket=12 ThreadsPerCore=1 State=UNKNOWN RealMemory=2063500 Feature=48cores,centos7
NodeName=pg-node[163-210] Sockets=2 CoresPerSocket=14 ThreadsPerCore=1 State=UNKNOWN RealMemory=128500 Feature=28cores,centos7
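# Gres=gpu:k40:2 on the GPU nodes also requires a gres.conf on those
# nodes; a minimal sketch (device paths assumed, not taken from this repo):
#   NodeName=pg-gpu[01-06] Name=gpu Type=k40 File=/dev/nvidia[0-1]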