Prometheus Ansible
Prometheus Ansible
Prometheus Ansible Jinja2 template: prometheus.yml.j2
# ANSIBLE MANAGED
# Prometheus server configuration, rendered once per host by the Ansible
# template module. Templated values are quoted so that an empty or
# special-character expansion still renders as a YAML string.
global:
  scrape_interval: 30s
  scrape_timeout: 10s
  evaluation_interval: 30s
  external_labels:
    # Per-host label attached to everything this server sends out.
    replica: "{{ ansible_host }}"

rule_files:
  - alert-rules.yml

alerting:
  alert_relabel_configs:
    # Replace the per-host replica label with the site name on outgoing
    # alerts; presumably so that alerts from the servers of one site carry
    # identical label sets -- confirm against the Alertmanager routing.
    - source_labels: [replica]
      regex: .*
      target_label: replica
      replacement: "{{ site }}"
  alertmanagers:
    - static_configs:
        - targets:
# The loop tags stay at column 0 so they add no stray indentation to the
# rendered target list.
{% for item in groups['prometheus_servers'] %}
            - "{{ item }}:9093"
{% endfor %}
      scheme: http
      timeout: 10s
      api_version: v1

scrape_configs:
  # node-exporter
  - job_name: linux-servers
    honor_timestamps: true
    metrics_path: /metrics
    scheme: http
    dns_sd_configs:
      # One SRV name per site; the record was previously listed twice.
      - names:
          - "_{{ site }}-prom-linux._tcp.abc.local"
        refresh_interval: 60s

  # process-exporter
  - job_name: lb_services
    static_configs:
      - targets:
          - "{{ site }}lb01.abc.local:9256"
          - "{{ site }}lb02.abc.local:9256"

  # Prometheus scraping itself
  - job_name: prometheus
    honor_timestamps: true
    metrics_path: /metrics
    scheme: http
    static_configs:
      - targets:
          - localhost:9090

  # wmi-exporter / windows-exporter
  - job_name: windows-servers
    honor_timestamps: true
    metrics_path: /metrics
    scheme: http
    dns_sd_configs:
      - names:
          - "_{{ site }}-prom-win._tcp.abc.local"
        refresh_interval: 60s
alert-rules.yml.j2
# ANSIBLE MANAGED
# Prometheus alerting rules. The annotation strings are wrapped in Jinja2
# raw/endraw markers so that Ansible leaves the Prometheus label templates
# untouched when it renders this file.
groups:
  - name: General
    rules:
      # Fires for any scrape target that is down, except targets on port 9256.
      - alert: Exporter/Server Offline
        expr: "up{instance!~'.*:9256'} == 0"
        for: 5m
        labels:
          severity: critical
          # Label values are strings; quoted so it is never read as a boolean.
          slack: "true"
        annotations:
          description: "{% raw %}{{ $labels.instance }} has not been able to be scraped for more than 5 minutes. Check Exporter is running and Server is online.{% endraw %}"
          summary: "{% raw %}{{ $labels.instance }} not available{% endraw %}"

  - name: Windows
    rules:
      # Fires when the splunkforwarder service is not in the running state on
      # an instance whose name contains "app".
      - alert: WINDOWS SERVICE - The Splunk Forwarder Service on App servers is not RUNNING
        expr: "windows_service_state{instance=~'.*app.*',name='splunkforwarder',state='running'} == 0"
        for: 5m
        labels:
          severity: warning
          slack: "true"
        annotations:
          description: "{% raw %}Splunk Forwarder Service on {{ $labels.instance }} has been down for more than 5 minutes.{% endraw %}"
          summary: "{% raw %}Splunk Forwarder on {{ $labels.instance }} is not running{% endraw %}"
roles/linux_prometheus/handlers/main.yml
---
# Handlers notified by the config-file tasks. Each one restarts an existing
# container by name; the containers themselves are defined in the role's tasks.

- name: restart alertmanager
  docker_container:
    name: alert-manager
    state: started
    restart: true

- name: restart prometheus
  docker_container:
    name: prometheus
    state: started
    restart: true
roles/linux_prometheus/files/slack-template.tmpl
{{/*
custom_slack_title: notification title of the form "[STATUS:count] :emoji: alertname".
STATUS is the upper-cased .Status; count is the number of firing alerts when
the status is "firing" and the number of resolved alerts when it is "resolved";
alertname comes from .CommonLabels. This comment sits outside the define so it
adds no whitespace to the rendered title.
*/}}
{{ define "custom_slack_title" }}
[{{ .Status | toUpper -}}
{{- if eq .Status "firing" }}:
{{- .Alerts.Firing | len }}] :rageface: {{ .CommonLabels.alertname }}
{{ end }}
{{- if eq .Status "resolved" }}:
{{- .Alerts.Resolved | len }}] :arnold: {{ .CommonLabels.alertname }}
{{ end }}
{{ end }}
{{/*
custom_slack_message: notification body.
For each firing alert it prints the summary annotation (when set), the
severity label (when set), the description annotation, and every label as a
sorted name/value bullet. For each resolved alert it prints a
"Resolved Alerts" line with the summary annotation (when set).
This comment sits outside the define so it adds no whitespace to the output.
*/}}
{{ define "custom_slack_message" }}
{{ range .Alerts.Firing }}
{{ if .Annotations.summary }}{{ .Annotations.summary }}{{ end }}
{{ if .Labels.severity }}*Severity* - `{{ .Labels.severity }}`{{ end }}
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
{{ range .Alerts.Resolved }}
{{ if .Annotations.summary }}*Resolved Alerts* - {{ .Annotations.summary -}}
{{ end }}
{{ end}}
{{ end }}
roles/linux_prometheus/tasks/main.yml
---
# Storage: an LVM volume on /dev/sdb, formatted ext4 and mounted at
# /data/prometheus.

- name: Create a volume group on /dev/sdb for Prometheus
  lvg:
    pvs: /dev/sdb
    vg: vg.prometheus
  register: vg_create

# The logical volume and its filesystem are only built on the run that
# creates the volume group.
# NOTE(review): both steps are skipped whenever the volume group already
# exists, so a run that fails between these tasks will not self-heal on the
# next run -- confirm this gating is intentional.
- when: vg_create is changed
  block:
    - name: Create a logical volume for Prometheus
      lvol:
        lv: prometheus
        vg: vg.prometheus
        size: "100%FREE"

    - name: Create a filesystem
      filesystem:
        dev: /dev/vg.prometheus/prometheus
        fstype: ext4

- name: Create Prometheus directory
  file:
    state: directory
    path: /data/prometheus

- name: Mount Prometheus volume
  mount:
    src: /dev/vg.prometheus/prometheus
    path: /data/prometheus
    fstype: ext4
    state: mounted
# Directory layout and configuration files. Everything is owned by uid/gid
# 1000, the user the containers in this role are started as.

- name: Create supporting directories
  file:
    state: directory
    path: "/data/prometheus/{{ item }}"
    owner: "1000"
    group: "1000"
  loop: [alertmanager, configs, templates, tsdb]

# A change to either Prometheus file notifies the Prometheus restart handler.
- name: Copy prometheus.yml config file
  template:
    src: prometheus.yml.j2
    dest: /data/prometheus/configs/prometheus.yml
    owner: "1000"
    group: "1000"
  notify: restart prometheus

- name: Copy alert-rules.yml config file
  template:
    src: alert-rules.yml.j2
    dest: /data/prometheus/configs/alert-rules.yml
    owner: "1000"
    group: "1000"
  notify: restart prometheus

# A change to the Alertmanager config or its notification templates notifies
# the Alertmanager restart handler.
- name: Copy alertmanager.yml config file
  template:
    src: alertmanager.yml.j2
    dest: /data/prometheus/configs/alertmanager.yml
    owner: "1000"
    group: "1000"
  notify: restart alertmanager

# Copied verbatim (copy, not template), so the Go template delimiters in the
# file are not interpreted by Ansible.
- name: Copy template files
  copy:
    src: "{{ item }}"
    dest: "/data/prometheus/templates/{{ item }}"
    owner: "1000"
    group: "1000"
  loop: [slack-template.tmpl]
  notify: restart alertmanager
# Prerequisites for the docker_* modules used by the remaining tasks.

# Python Docker SDK on the managed host.
- name: Install docker python module using pip
  pip:
    name: "docker"

# Network that every container in this role is attached to.
- name: Create prom-net Docker network
  docker_network:
    name: "prom-net"
# One-shot validation containers. With detach disabled the task waits for the
# tool to exit and reflects its result, so an invalid config fails the play
# before the restart handlers run. cleanup removes the finished container so
# exited check containers do not accumulate, and the configs are mounted
# read-only because the tools only need to read them.
# The image tags use #{...} placeholders that Ansible does not expand;
# presumably they are substituted by the deployment tooling -- confirm.

- name: Check Prometheus config file
  docker_container:
    name: prom-check-config
    image: "prom/prometheus:#{prometheusVersion}"
    detach: false
    cleanup: true
    # container_default_behavior: no_defaults
    entrypoint:
      - /bin/promtool
    command: "check config /etc/prometheus/prometheus.yml"
    networks:
      - name: prom-net
    user: "1000:1000"
    volumes:
      - /data/prometheus/configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - /data/prometheus/configs/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro

- name: Check Alertmanager config file
  docker_container:
    name: alertmanager-check-config
    image: "prom/alertmanager:#{alertManagerVersion}"
    detach: false
    cleanup: true
    # container_default_behavior: no_defaults
    entrypoint:
      - /bin/amtool
    command: "check-config /etc/prometheus/alertmanager.yml"
    networks:
      - name: prom-net
    user: "1000:1000"
    volumes:
      - /data/prometheus/configs/alertmanager.yml:/etc/prometheus/alertmanager.yml:ro
# Long-running Prometheus server. Config and rules come from the files
# rendered earlier in this role; the TSDB lives in /data/prometheus/tsdb on
# the host. Port mappings and user are quoted so they are always read as
# strings, and the boolean is written canonically.
- name: Create a Prometheus Container
  docker_container:
    name: prometheus
    image: "prom/prometheus:#{prometheusVersion}"
    # container_default_behavior: no_defaults
    command: "--config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/data --web.enable-lifecycle"
    networks:
      - name: prom-net
    networks_cli_compatible: true
    published_ports:
      - "9090:9090"
    restart_policy: always
    state: started
    user: "1000:1000"
    volumes:
      - /data/prometheus/configs/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/prometheus/configs/alert-rules.yml:/etc/prometheus/alert-rules.yml
      - /data/prometheus/tsdb:/data
# Long-running Alertmanager, peered with alert_manager_peer on port 9094.
# 9093 is the API/UI port that prometheus.yml.j2 targets; 9094 is the cluster
# port. Alertmanager clustering needs both TCP and UDP on the cluster port,
# and a Docker port mapping is TCP-only unless /udp is given, so the UDP
# mapping is published explicitly.
- name: Create a Alert Manager Container
  docker_container:
    name: alert-manager
    image: "prom/alertmanager:#{alertManagerVersion}"
    command: "--cluster.peer={{ alert_manager_peer }}:9094 --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager"
    networks:
      - name: prom-net
    networks_cli_compatible: true
    published_ports:
      - "9093:9093"
      - "9094:9094"
      - "9094:9094/udp"
    restart_policy: always
    state: started
    user: "1000:1000"
    volumes:
      - /data/prometheus/alertmanager:/alertmanager
      - /data/prometheus/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /data/prometheus/templates:/etc/alertmanager/templates
# node-exporter for the Prometheus host itself, published on port 9100.
# NOTE(review): no host paths are mounted and the container runs on the
# prom-net bridge, so the exporter presumably reports the container's view of
# filesystems and network rather than the host's -- confirm against the
# node_exporter Docker guidance before relying on these metrics.
- name: Create node-exporter Container
  docker_container:
    name: node-exporter
    image: "prom/node-exporter:#{nodeExporterVersion}"
    networks:
      - name: prom-net
    networks_cli_compatible: true
    published_ports:
      - "9100:9100"
    restart_policy: always
    state: started
# Open the service ports in firewalld, both permanently and in the running
# configuration. 9090, 9093 and 9100 are the ports published by the containers
# in this role. 9094 (tcp and udp) is the Alertmanager cluster port that the
# role peers on; it was previously not opened.
# NOTE(review): nothing in this role listens on 10901 or 19191; presumably
# they belong to a service deployed elsewhere -- confirm they are still needed.
- name: Ensure firewalld is configured
  firewalld:
    port: "{{ item }}"
    permanent: true
    immediate: true
    state: enabled
  loop:
    - 9090/tcp
    - 9093/tcp
    - 9094/tcp
    - 9094/udp
    - 9100/tcp
    - 10901/tcp
    - 19191/tcp
Prometheus playbook
---
# Applies the linux_prometheus role to every host in the prometheus_servers
# inventory group, with privilege escalation enabled.
- name: Install and Config Prometheus
  hosts: prometheus_servers
  become: true
  roles:
    - linux_prometheus