Browse Source

Using a prometheus server on knyft instead of prox

pull/8/head
Egon Rijpkema 4 years ago
parent
commit
a62ebcbde7
  1. 1
      ansible.cfg
  2. 2
      hosts
  3. 1
      node_exporter.yml
  4. 5
      prometheus.yml
  5. 5
      promproxy.yml
  6. 50
      roles/prom_server/tasks/main.yml
  7. 97
      roles/prom_server/templates/etc/alerting.rules
  8. 49
      roles/prom_server/templates/etc/datahandling.json
  9. 16
      roles/prom_server/templates/etc/gpu_nodes.json
  10. 128
      roles/prom_server/templates/etc/prometheus.yml
  11. 237
      roles/prom_server/templates/etc/targets.json
  12. 19
      roles/prom_server/templates/prometheus.service

1
ansible.cfg

@ -3,3 +3,4 @@ inventory = hosts @@ -3,3 +3,4 @@ inventory = hosts
host_key_checking = False
forks = 20
stdout_callback = debug
vault_password_file = .vault_pass.txt

2
hosts

@ -1,7 +1,7 @@ @@ -1,7 +1,7 @@
[scheduler]
pg-node001 mailhub=172.23.56.1 rewrite_domain=knyft.hpc.rug.nl
[prometheus_proxy]
[prometheus]
pg-node001
[login]

1
node_exporter.yml

@ -7,4 +7,3 @@ @@ -7,4 +7,3 @@
become: True
roles:
- node_exporter
- prom_client

5
prometheus.yml

@ -0,0 +1,5 @@ @@ -0,0 +1,5 @@
---
# Playbook: apply the prom_server role, with privilege escalation, to every
# host in the `prometheus` inventory group.
- hosts: prometheus
  become: true
  roles:
    - prom_server

5
promproxy.yml

@ -1,5 +0,0 @@ @@ -1,5 +0,0 @@
---
- hosts: prometheus_proxy
become: True
roles:
- prom_proxy

50
roles/prom_server/tasks/main.yml

@ -0,0 +1,50 @@ @@ -0,0 +1,50 @@
---
# Tasks for the prom_server role: create the directory layout under
# /srv/prometheus, install the Prometheus settings files and the systemd
# unit, then enable and restart the service.

# The path must follow the loop variable. With a fixed path the loop only
# ever creates /srv/prometheus and the two directories listed below (the ones
# prometheus.service mounts into the container) are never created.
# NOTE(review): no owner is set on these directories; confirm that the user
# inside the prom/prometheus container can write to
# /srv/prometheus/prometheus.
- name: Create prometheus directories.
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - /srv/prometheus/etc/prometheus
    - /srv/prometheus/prometheus

# File modes are quoted octal strings: a bare 644 is parsed as the decimal
# integer 644 (octal 1204), which is not rw-r--r--.
- name: Install settings files.
  copy:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: "0644"
    owner: root
    group: root
  with_items:
    - alerting.rules
    - datahandling.json
    - gpu_nodes.json
    - prometheus.yml
    - targets.json
  tags:
    - service-files

- name: Install service files.
  template:
    src: templates/prometheus.service
    dest: /etc/systemd/system/prometheus.service
    mode: "0644"
    owner: root
    group: root
  tags:
    - service-files

# Make systemd re-read its unit files so the unit installed above is picked
# up. Uses the systemd module rather than shelling out to systemctl.
- name: install service files
  systemd:
    name: prometheus.service
    daemon_reload: true

- name: enable service at boot
  systemd:
    name: prometheus.service
    enabled: true

# `restarted` (not `started`) so that updated settings files take effect on
# every run.
- name: make sure servcies are started.
  systemd:
    name: prometheus.service
    state: restarted
  tags:
    - start-service

97
roles/prom_server/templates/etc/alerting.rules

@ -0,0 +1,97 @@ @@ -0,0 +1,97 @@
# Prometheus alerting rules, organised in four groups: basic, gpu, custom and
# molgenis. The {{ ... }} expressions in annotations are Prometheus alert
# templates (they reference $labels), not Jinja2.
groups:
  # Node-level health for targets scraped under job="node".
  - name: basic
    rules:
      # A node target has reported up == 0 for 10 minutes.
      - alert: InstanceDown
        expr: up{job="node"} == 0
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
            for more than 10 minutes.'
          summary: Instance {{ $labels.instance }} down

      # The node's reported time differs from time() by more than 140.
      - alert: InstanceClockDrift
        expr: abs(node_time - time()) > 140
        for: 5m
        labels:
          severity: info
        annotations:
          description: '{{ $labels.job }} has wrong clock setting'
          summary: Instance {{ $labels.instance }} has clock drift

      # Linear extrapolation of the last 2h of free space goes below zero
      # within 8 hours; /tmp, /local and /target/gpfs3 are excluded.
      - alert: DiskWillFillIn8Hours
        expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
        for: 2h
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} will fill up within 8 hours
          summary: '{{ $labels.instance }} disk full'

      # Same prediction over a 6h window and a 72 hour horizon.
      - alert: DiskWillFillIn72Hours
        expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
        for: 8h
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} will fill up within 72 hours
          summary: '{{ $labels.instance }} disk almost full'

      # Free space below 5.24288e+06 on any mountpoint not excluded by the
      # regex; fuse filesystem types are ignored. /tmp has its own rule below.
      - alert: DiskFull
        expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
        for: 5m
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
          summary: '{{ $labels.instance }} Disk full'

      # Same threshold for /tmp, but with a longer (30m) hold time.
      - alert: tmpFull
        expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
        for: 30m
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} Has a full /tmp
          summary: '{{ $labels.instance }} /tmp full'

      # node_boot_time changed by more than 10 within the last hour.
      - alert: NodeRebooted
        expr: delta(node_boot_time[1h]) > 10
        for: 1m
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} has been rebooted.
          summary: '{{ $labels.instance }} rebooted'

  # Rules for targets scraped under job="gpu".
  - name: gpu
    rules:
      - alert: UncorrectedECC
        expr: eccuncorrectedvolatile_total{job="gpu"} > 0
        for: 1m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }}: At least one GPU has uncorrectable ECC errors..'
          summary: '{{ $labels.instance }} GPUEcc'

  # Probe-based checks on individual endpoints. These rules carry no labels
  # or annotations.
  - name: custom
    rules:
      - alert: GangliaDown
        expr: probe_success{instance="http://monitor.hpc.rug.nl/ganglia",job="httpblackbox"}
          == 0
        for: 10m

      - alert: ProfilingGrafanaDown
        expr: probe_success{instance="https://profiling.hpc.rug.nl",job="httpblackbox"}
          == 0
        for: 5m

      # NOTE(review): unlike the other probes this fires when the probe
      # SUCCEEDS (== 1); presumably the port is expected to be unreachable.
      # Confirm this is intentional.
      - alert: Openstack03MemcachedUp
        expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 1
        for: 1m

      # Earliest certificate expiry is less than 86400 * 30 seconds away.
      - alert: SSLCertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry{job="httpblackbox"} - time() < 86400 * 30
        for: 10m

  # The same certificate and availability checks for job="molgenisblackbox".
  - name: molgenis
    rules:
      - alert: SSLCertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry{job="molgenisblackbox"} - time() < 86400 * 30
        for: 10m

      - alert: HttpDown
        expr: probe_success{job="molgenisblackbox"} == 0
        for: 10m

      # - alert: TestAlert
      #   expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
      #   for: 1m

49
roles/prom_server/templates/etc/datahandling.json

@ -0,0 +1,49 @@ @@ -0,0 +1,49 @@
[
{
"targets": [
"dh-node01:9100",
"dh-node02:9100",
"dh-node03:9100",
"dh-node04:9100",
"dh-node05:9100",
"dh-node06:9100",
"dh-node07:9100",
"dh-node08:9100",
"dh-node09:9100",
"dh-node11:9100",
"dh-node12:9100",
"dh-node13:9100",
"dh-node14:9100",
"dh-node15:9100",
"dh-node16:9100",
"dh-node17:9100",
"dh-node18:9100",
"dh-node19:9100",
"dh-node20:9100",
"dh1-mds01:9100",
"dh1-mds02:9100",
"dh1-oss01:9100",
"dh1-oss02:9100",
"dh1-oss03:9100",
"dh1-oss04:9100",
"dh2-mds01:9100",
"dh2-mds02:9100",
"dh2-oss01:9100",
"dh2-oss02:9100",
"dh2-oss03:9100",
"dh2-oss04:9100",
"dh3-mds01:9100",
"dh3-mds02:9100",
"dh3-oss01:9100",
"dh3-oss02:9100",
"dh3-oss03:9100",
"dh3-oss04:9100"
],
"labels": {
"env": "datahandling",
"job": "node"
}
}
]

16
roles/prom_server/templates/etc/gpu_nodes.json

@ -0,0 +1,16 @@ @@ -0,0 +1,16 @@
[
{
"targets": [
"pg-gpu01:9101",
"pg-gpu02:9101",
"pg-gpu03:9101",
"pg-gpu04:9101",
"pg-gpu05:9101",
"pg-gpu06:9101"
],
"labels": {
"env": "peregrine",
"job": "gpu"
}
}
]

128
roles/prom_server/templates/etc/prometheus.yml

@ -0,0 +1,128 @@ @@ -0,0 +1,128 @@
$ANSIBLE_VAULT;1.1;AES256
39653130393437336565646131316263313333363463313135383139383964656233643137633562
3662363237313339336435623964326232646536306133630a306666666536633638386534363761
35613961663335396336623635613837623433663962306134343837323334343336336262646333
3834313832373365330a376234666633343731356433363265346533663261393463613933643232
64353463356637396538666630663361656330373134393831316138313939383066616264393939
39343431303562653530646165623733666661393930323365326665666333393331626362323865
31666664386162623938313861343263336432383063333632316638653162363762383933613637
62336232353165356662363339376639313166643166303231373562626538356662373562303730
36306332626164353432353862303263623162346535636163303436613964666139306532373930
65666233613039663263616139373935346164323236313339623866383638313464336534623538
34386338626330386435613766376635363237656638663439363232636131383930623566336339
31333038313662626163653638653162633138386331306531323236316561663437396630623130
31366162333661663736366430633364396237363131313361623536666463626632393138303039
34363033396465346333633534653962393864653136303361396462386535663034303663303864
30396263363336653966326230666565633436623962623361313663333663323230323963363532
65363561356338653833356331313631633362363762626435623734323537633931333532383836
39383632636563633032393639656530313035663732353562386363333361306533393230613733
61363638656265616632326130623336613164386534373163326434643635643039313566613133
63353466666535376264356635346164346566616134626363333865323464303731333664346462
64646138333731613033346232646137336432336233303531613337393031376361303333633465
39646230636338316261626263323132363266626530306134366638393763333862306235303838
36366334623830613534353534353364373866326630653036633161386236356636363563616166
35613039333634613265306138343563366337666532373331393466373566366239663865663362
31653337646435313431326632633634633836616362346164306265316432376130333139653565
38366639366335383461633464356439383932343764373361303661376136376631333861363836
30313866616631366337373736623464393738666435383532356163623838323230306562646661
66383163636438656431356336363734643062626463303731343035356139373936616635383535
31333932616338353864643938643464333836346532633264373237633038633638356466356565
39306238313836306634663839333839303438343939336363363233613838663365653364626538
65376435373936323534353263313439326362343366313238646164323861373766613230646463
63373633663663396539616264356133353730306462646532653637333231383131643233313464
66613063623164376361613664323963333263616331626361623166313331653030313366626638
62333235336165373433613437636134333835343831646166306235306562623266333734653162
39306365643265323162636532383461383863633761313534636635316365613064333865623930
66653433383437633937383835623066306166623437323834643434643432653164656537353933
34346437666463343436333732396530613061366165623033636362353330323430623732383233
66613863633563346433636636326636336563643033653364356366303036633130656263313462
32643533333737613963616164376161636162303736356664613561316431306661666537323334
38646161333731343435653730346664343630613064373662306238663839646333613638653035
62333063616432656233646434346334646664663134333561616333313638626632353032633938
36616338393434313432396664373961316535613239313465613262316535656435373735643764
66383838616265653535663536656332663838346631343537303433616666393536303965396263
64343039633830663366633633323237666565323930633539303964396234376431636465616664
63346261663937666566396161396339653031623337636661333861656235323063643363643230
66343965633935333730633834386431396537326662616234396635636330353562653635303962
38363166343636313935313035633863326263393330346464396533656137356531373066376334
30316534346233613839343663313266333466383931663537663131313530613232346665303066
34613465393831326431383363353165333964656561326162663261653135386139396665663866
30373232393166373363346339343335393238393462323338353161303335396333613963333734
37303031366237366333666639303337313366666665353061653561383064306637346638643331
34356135623862303763663062663733303562376134393137313935656131653062333561343261
62323138336134653637316134383364346337663539646531323632636164343231346432643261
63373461376264363332323530653632663863613237383735333963366635336538383339323161
35613162386164343163653231353134313166643033643735333438393534373363653366653533
34656666303739376362323061636161376535663338336433333630376632346439643034653133
32623937363236373832616632383332613032633233316433646664316537613635383636643662
36363462393466363036396534653339326163326537656433363663373533656530323636613262
34633461663663343331333365356134393066306638303437333365626139363033316232653232
36303637386139656337323663656435626434383461376161366330363865663333356632336236
62376539346364353935346634646163316337636161306433396436336266303836323766346432
33663331303932333239663132376535356436623234376431656438646136666261343734363362
63353936386661343734636566343430623965613463333765373033666534613538646565663639
36353032346636616537613434376537626239346330383030363739386233373634643931313738
32653736346137656639646366333733326130663339366666633930356164376365383633623064
61383765643361613936646363376561363361633031316338633166363730303339313736643431
62346231376361616132356466353261373761626534366361393566633336346438333266633462
36333361313661363464666634326461356636353030333136623366343739656237376261313866
66666364306465643532336538653339396333666235333465616461323533313235306263616630
37666565383139353537313535333261303131653130356636386364353231626562623236306361
31343261663838343633316534653031626233356666633733656661373964666131663332366564
30386665623432343363633863343139626333383236353663383566323437616532386465633731
32363238613865346632643338313866663961346230346130373135393635313235336139383235
63393834373962343966346437336433376239303737326338636237303261326336316232613637
33343233653537646132333932346261343135373035376464303331613235623163613864646565
30303233323763363138303064643537643730646131633465303437663135653936343030303066
38336566366163306438383866633164363266633863353339636530626364373336633133343366
31303138306438623864626635663466643031663062656238323738363732303762663762643539
33363034663230626132633338663434396339643438393164653337353331303163323634646436
66323432303530373337346366376466633439383235656537313162346239313964323437643333
62343164393836646265323266393338346634353936666230313236343065613535616435366337
33353735373033323964313861636534386561396236326666653766666261363465333838393765
62363237323833386665626334393963336134666439343466353964643861363265663465363162
34366464363962326366633930343634646536366533653966313133313161363763363539363639
36313963333734316163383932323730626132303931366632326136386438326166643639353331
62376462323830633338613337663165356634336139393839363134633735396636653161333165
32316466383634613163663235636262383465383737376364353535623033663861333935316230
33663735373835376463626238643936646466626465616139366665663966306132356663646239
34333731396230376334663934313430333964663366346236383535383134343263303464626565
61643664626138353365353966326233656363663665316332316135373761613433313164356138
34396563353939346661623032663731323830383766356465643637623266303739613137343264
63366538393461323764636236333038306335666265316633376663353031336565633431316166
36353563323230366163633166323763636533623766363336336165326138613934343639656463
34313061383135333065373938376137363633313137313866353764393361666364623334613236
36343933303332303836333537636532633539626663616132346437666434373461646266303336
64353930363237363938393466393664613032396364323864613339363535306636346132323666
30316136313936353861666632356263333836396435643138376365303336613232393433306166
34613862313264656661353262336130326235383330343130363332656232303666323134313938
32366663316165306632366637616533303033363965366461373230666265633435346537636630
37653938653834623565656466393937623363313935383933626662646230663661656133326130
31326530313361626534353064633639303061633663373636306564333063636666656331333365
35613032306634626436346665303066323130393239343235656562656533356565333061366331
64666335623965363630633531646464336163353333336132313563633936313738333733633638
34346539353762373833333466636132303761313231636432663363326331663536653965393438
37616632363664333336666233353864323066373232663432646637613836333462396466643030
34626130626630343466346436613634636136393830366237643532366363636432353333323130
36366565613163363065626466343263613964316162353639643731313764323935373738346563
63623138393839323462366263363131316332663732373062323336633163393830343335366439
65636235313463613562343134646665646230343961376662326637613062306462663137616163
33333938393033373062316161336431353761373961346462326137636264383338363965353364
31386539376531616239616338393434336134653566303161323462376364633163663639323966
34323765366133663666333265663230336366393764323135393132623966626263353362363434
31663336633534646362613365326138306166353061366530383362343866373761343063663962
62636562633863353135343932626362366563313466386135333035303339363339383834646664
37396466663634343161363366653132326264373866656130343263643765356135656531333337
37323462306665326533383434656538313564663232353939336463326163323531373161613833
62633138353662333565663261643436393538643133356436636363393063383164666430633965
66343730383534343038396238343266396639323661383363393863663335643035316231313035
64303832393463616165386330366137343530366531303061393166646162663565333334363563
39373933373464646132356466626233613430306639353933613065363861393861643134633639
66363264666336626463653235343731396133346431383035663064393437646130333230313236
65306563326333383837623936323333343063376238616235386337663737663530643464353639
39323130643064323939383065666130303230343037663739646537393330373836656561343832
65343163613730343062346165386463313765363164643231623563386530343737646264333064
37336233633261306538373731383431633933383734666362386530326564343839336437353036
66613566336230386165356432323963666538343134303536643933643065623463633733626663
30366535336536666431653134303038313666646364313832636565613430303731646437333033
363333326435613762646437323731643761

237
roles/prom_server/templates/etc/targets.json

@ -0,0 +1,237 @@ @@ -0,0 +1,237 @@
[
{
"targets": [
"pg-node132:9100",
"pg-node069:9100",
"pg-node068:9100",
"pg-node083:9100",
"pg-node139:9100",
"pg-node153:9100",
"pg-node178:9100",
"pg-node076:9100",
"pg-node194:9100",
"pg-node119:9100",
"pg-node128:9100",
"pg-node166:9100",
"pg-node056:9100",
"pg-gpu03:9100",
"pg-node041:9100",
"pg-node100:9100",
"pg-node116:9100",
"pg-node175:9100",
"pg-node062:9100",
"pg-node027:9100",
"pg-node150:9100",
"pg-node046:9100",
"pg-node165:9100",
"pg-mds01.hpc.local:9100",
"pg-node058:9100",
"pg-node152:9100",
"pg-node208:9100",
"pg-node065:9100",
"pg-node003:9100",
"pg-node124:9100",
"pg-node203:9100",
"pg-gpu01:9100",
"pg-node059:9100",
"pg-node126:9100",
"pg-node101:9100",
"pg-node084:9100",
"pg-gpu06:9100",
"pg-node145:9100",
"pg-node157:9100",
"pg-node009:9100",
"pg-node045:9100",
"pg-node197:9100",
"pg-node044:9100",
"pg-node024:9100",
"pg-node151:9100",
"pg-gpu02:9100",
"pg-node033:9100",
"pg-node125:9100",
"pg-node010:9100",
"pg-node121:9100",
"pg-node077:9100",
"pg-node136:9100",
"pg-node112:9100",
"pg-node055:9100",
"pg-node070:9100",
"pg-node054:9100",
"pg-node199:9100",
"pg-node007:9100",
"pg-node047:9100",
"pg-node185:9100",
"pg-node105:9100",
"pg-node127:9100",
"pg-node200:9100",
"pg-node201:9100",
"pg-ost04.hpc.local:9100",
"pg-node106:9100",
"pg-node118:9100",
"pg-node131:9100",
"pg-node169:9100",
"pg-node179:9100",
"pg-node096:9100",
"pg-node120:9100",
"pg-node002:9100",
"pg-node094:9100",
"pg-node008:9100",
"pg-node074:9100",
"pg-node140:9100",
"pg-node072:9100",
"pg-node202:9100",
"pg-node021:9100",
"pg-node186:9100",
"pg-node159:9100",
"pg-node025:9100",
"pg-node057:9100",
"pg-node073:9100",
"pg-node092:9100",
"pg-node180:9100",
"pg-node081:9100",
"pg-node148:9100",
"pg-node209:9100",
"pg-node170:9100",
"pg-node113:9100",
"pg-node090:9100",
"pg-node115:9100",
"pg-node053:9100",
"pg-node018:9100",
"pg-node023:9100",
"pg-node123:9100",
"pg-node162:9100",
"pg-node107:9100",
"pg-node060:9100",
"pg-node029:9100",
"pg-node155:9100",
"pg-node143:9100",
"pg-node171:9100",
"pg-node019:9100",
"pg-node080:9100",
"pg-node182:9100",
"pg-node164:9100",
"pg-node196:9100",
"pg-node110:9100",
"pg-node154:9100",
"pg-node144:9100",
"pg-node039:9100",
"pg-node038:9100",
"pg-node050:9100",
"pg-node111:9100",
"pg-node204:9100",
"pg-node015:9100",
"pg-node087:9100",
"pg-node184:9100",
"pg-node177:9100",
"pg-node176:9100",
"pg-node192:9100",
"pg-node040:9100",
"pg-node067:9100",
"pg-ost03.hpc.local:9100",
"pg-node049:9100",
"pg-node206:9100",
"pg-node103:9100",
"pg-node034:9100",
"pg-node020:9100",
"pg-node198:9100",
"pg-node168:9100",
"pg-node102:9100",
"pg-gpu05:9100",
"pg-node095:9100",
"pg-node142:9100",
"pg-node172:9100",
"pg-node133:9100",
"pg-node075:9100",
"pg-node104:9100",
"pg-node005:9100",
"pg-node138:9100",
"pg-node031:9100",
"pg-node108:9100",
"pg-node001:9100",
"pg-node097:9100",
"pg-node117:9100",
"pg-node160:9100",
"pg-node173:9100",
"pg-node052:9100",
"pg-node032:9100",
"pg-memory06:9100",
"pg-node026:9100",
"pg-node064:9100",
"pg-node014:9100",
"pg-node190:9100",
"pg-node181:9100",
"pg-node158:9100",
"pg-node016:9100",
"pg-node086:9100",
"pg-node085:9100",
"pg-node089:9100",
"pg-node195:9100",
"pg-node078:9100",
"pg-node135:9100",
"pg-node071:9100",
"pg-node082:9100",
"pg-node091:9100",
"pg-ost01.hpc.local:9100",
"pg-node161:9100",
"pg-node028:9100",
"pg-memory04:9100",
"pg-node137:9100",
"pg-node183:9100",
"pg-node051:9100",
"pg-node189:9100",
"pg-node006:9100",
"pg-memory05:9100",
"pg-node093:9100",
"pg-gpu04:9100",
"pg-mds02.hpc.local:9100",
"pg-memory01:9100",
"pg-node191:9100",
"pg-node037:9100",
"pg-node114:9100",
"pg-ost02.hpc.local:9100",
"pg-node210:9100",
"pg-node004:9100",
"pg-node043:9100",
"pg-node079:9100",
"pg-node149:9100",
"pg-node167:9100",
"pg-node022:9100",
"pg-node066:9100",
"pg-node013:9100",
"pg-node063:9100",
"pg-node130:9100",
"pg-memory02:9100",
"pg-memory07:9100",
"pg-node109:9100",
"pg-node134:9100",
"pg-node099:9100",
"pg-node187:9100",
"pg-node174:9100",
"pg-node017:9100",
"pg-node098:9100",
"pg-node141:9100",
"pg-node188:9100",
"pg-node011:9100",
"pg-node042:9100",
"pg-node147:9100",
"pg-node193:9100",
"pg-node163:9100",
"pg-node207:9100",
"pg-node156:9100",
"pg-node146:9100",
"pg-node048:9100",
"pg-node036:9100",
"pg-node088:9100",
"pg-node035:9100",
"pg-node012:9100",
"pg-node205:9100",
"pg-node061:9100",
"pg-node030:9100",
"pg-node122:9100"
],
"labels": {
"env": "peregrine",
"job": "node"
}
}
]

19
roles/prom_server/templates/prometheus.service

@ -0,0 +1,19 @@ @@ -0,0 +1,19 @@
# systemd unit that runs the Prometheus server as a Docker container.
# The container is named after the unit itself (%n expands to the full unit
# name), which lets the pre-start steps below find and remove a stale one.
[Unit]
Description=Prometheus monitoring
# Ordered after, and dependent on, the Docker daemon.
After=docker.service
Requires=docker.service
[Service]
# 0 disables the start timeout.
TimeoutStartSec=0
Restart=always
# Remove any leftover container with the same name before starting. The
# leading "-" makes systemd ignore a failure of these two commands (for
# example when no such container exists).
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
# Run prom/prometheus v2.2.1 on the host network. Two host directories are
# mounted into the container: /srv/prometheus/prometheus as /prometheus
# (passed as --storage.tsdb.path) and /srv/prometheus/etc/prometheus as
# /etc/prometheus (holding the file passed as --config.file). Retention is
# set to 7d and --web.enable-lifecycle is switched on.
ExecStart=/usr/bin/docker run --name %n \
--network host \
-v /srv/prometheus/prometheus:/prometheus \
-v /srv/prometheus/etc/prometheus:/etc/prometheus \
prom/prometheus:v2.2.1 \
--storage.tsdb.retention 7d --config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus --web.enable-lifecycle
[Install]
WantedBy=multi-user.target
Loading…
Cancel
Save