Patch summary: restrict the /etc/hosts play to the new all-except-storage
inventory group, add an nvidia_exporter role (executable + systemd unit) for
gpu_node hosts, add a test inventory, and add lustre client tasks in `tmp`.
Note: blob ids on the index lines of edited files are carried over, not recomputed.

diff --git a/etc_hosts.yml b/etc_hosts.yml
index 9bba23b..c8c929e 100644
--- a/etc_hosts.yml
+++ b/etc_hosts.yml
@@ -1,7 +1,7 @@
 ---
 # Install a peregrine /etc/hosts
 
-- hosts: all
+- hosts: all-except-storage
   become: true
   tasks:
   - copy:
diff --git a/hosts b/hosts
index f4c411a..17f00f6 100644
--- a/hosts
+++ b/hosts
@@ -99,3 +99,11 @@ pg-node[211:225]
 
 [merlin-nodes]
 pg-node[230:234]
+
+[all-except-storage:children]
+compute_node
+gpu_node
+interactive
+login
+memory_node
+scheduler
diff --git a/nvidia-exporter.yml b/nvidia-exporter.yml
new file mode 100644
index 0000000..4c5010e
--- /dev/null
+++ b/nvidia-exporter.yml
@@ -0,0 +1,5 @@
+---
+- hosts: gpu_node
+  become: True
+  roles:
+    - nvidia_exporter
diff --git a/roles/nvidia_exporter/tasks/main.yml b/roles/nvidia_exporter/tasks/main.yml
new file mode 100644
index 0000000..cffc2c0
--- /dev/null
+++ b/roles/nvidia_exporter/tasks/main.yml
@@ -0,0 +1,39 @@
+---
+# Install the nvidia exporter executable and run it as a systemd service.
+- file:
+    path: /usr/local/prometheus
+    state: directory
+    mode: "0755"
+
+- name: Install smi exporter
+  copy:
+    src: "{{ playbook_dir }}/promtools/results/nvidia-exporter"
+    dest: /usr/local/prometheus/nvidia-exporter
+    mode: "0755"
+
+- name: Install service files.
+  template:
+    src: templates/nvidia-exporter.service
+    dest: /etc/systemd/system/nvidia-exporter.service
+    # Quoted octal string; a bare 644 is read as a decimal integer, not octal.
+    mode: "0644"
+    owner: root
+    group: root
+  tags:
+    - service-files
+
+- name: install service files
+  command: systemctl daemon-reload
+
+# The unit name below must match the unit file installed above.
+- name: enable service at boot
+  systemd:
+    name: nvidia-exporter.service
+    enabled: yes
+
+- name: make sure servcies are started.
+  systemd:
+    name: nvidia-exporter.service
+    state: restarted
+  tags:
+    - start-service
diff --git a/roles/nvidia_exporter/templates/nvidia-exporter.service b/roles/nvidia_exporter/templates/nvidia-exporter.service
new file mode 100644
index 0000000..1198700
--- /dev/null
+++ b/roles/nvidia_exporter/templates/nvidia-exporter.service
@@ -0,0 +1,10 @@
+[Unit]
+Description=prometheus nvidia exporter
+
+[Service]
+TimeoutStartSec=0
+Restart=always
+ExecStart=/usr/local/prometheus/nvidia-exporter
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/slurm-management/files/slurm-19.05.28-1.tar.gz b/roles/slurm-management/files/slurm-19.05.28-1.tar.gz
new file mode 100644
index 0000000..bfec99e
Binary files /dev/null and b/roles/slurm-management/files/slurm-19.05.28-1.tar.gz differ
diff --git a/test-hosts b/test-hosts
new file mode 100644
index 0000000..d74a9af
--- /dev/null
+++ b/test-hosts
@@ -0,0 +1,20 @@
+[interactive]
+centos7-test
+
+[prometheus_proxy]
+centos7-test
+
+[scheduler]
+centos7-test mailhub=172.23.56.1 rewrite_domain=knyft.hpc.rug.nl docker_storage_device=/dev/vdb
+
+[login]
+centos7-test
+
+[node]
+centos7-test
+
+[metadata]
+centos7-test
+
+[peregrine:children]
+scheduler
diff --git a/tmp b/tmp
new file mode 100644
index 0000000..4087f53
--- /dev/null
+++ b/tmp
@@ -0,0 +1,61 @@
+---
+- name: install lustre client
+  yum:
+    name: lustre-client-2.10.5-1.el7.x86_64
+    state: present
+    update_cache: yes
+    disable_gpg_check: yes
+  become: true
+
+- name: make endpoints to mount datahandling storage on.
+  file:
+    path: "{{ item }}"
+    mode: 0777
+    state: directory
+  with_items:
+    - /home
+    - /data
+    - /scratch
+
+- name: load the lustre kernel module.
+  modprobe:
+    name: lustre
+    state: present
+
+- name: set lustre.conf
+  template:
+    src: templates/lustre.conf
+    dest: /etc/modprobe.d/lustre.conf
+    mode: 0644
+    owner: root
+    group: root
+    backup: no
+
+- name: Mount /home
+  mount:
+    path: /home
+    src: 172.23.55.211@tcp11:172.23.55.212@tcp11:/home
+    fstype: lustre
+    opts: ro,seclabel,lazystatfs
+    state: present
+
+- name: Mount /data
+  mount:
+    path: /data
+    src: 172.23.55.211@tcp11:172.23.55.212@tcp11:/data
+    fstype: lustre
+    opts: rw,seclabel,lazystatfs
+    state: present
+
+- name: Mount /scratch
+  mount:
+    path: /scratch
+    src: 172.23.55.211@tcp11:172.23.55.212@tcp11:/scratch
+    fstype: lustre
+    opts: rw,seclabel,lazystatfs
+    state: present
+
+- name: mount all mountpoints in fstab
+  command: mount -a
+  args:
+    warn: false