Prometheus Ansible

Prometheus Ansible Jinja2 template: prometheus.yml.j2


# ANSIBLE MANAGED
# Global defaults applied to every scrape job and to rule evaluation.
global:
  scrape_interval: 30s
  scrape_timeout: 10s
  evaluation_interval: 30s

  # `replica` carries the inventory host of this Prometheus server. The value
  # is quoted so that an empty or boolean/number-looking host value still
  # renders as a YAML string.
  external_labels:
    replica: "{{ ansible_host }}"

# Rule files loaded by this server (path is relative, no directory given).
rule_files:
  - "alert-rules.yml"

# Alert delivery: relabeling applied to outgoing alerts, followed by the
# Alertmanager endpoints the alerts are sent to.
alerting:
  alert_relabel_configs:
  # Overwrite the `replica` label on every outgoing alert with the site name
  # (the regex matches any value).
  # NOTE(review): presumably done so alerts from each replica carry identical
  # labels and deduplicate in Alertmanager - confirm.
  - source_labels: [replica]
    regex: ".*"
    target_label: replica
    replacement: "{{ site }}"

  alertmanagers:
  # One target per host in the `prometheus_servers` inventory group, port 9093.
  - static_configs:
    - targets:
{% for item in groups['prometheus_servers'] %}
      - "{{ item }}:9093"
{% endfor %}
    scheme: http
    timeout: 10s
    # NOTE(review): newer Prometheus / Alertmanager releases have dropped the
    # v1 API - verify against the deployed image versions before upgrading.
    api_version: v1

# Scrape jobs. Linux and Windows hosts are discovered through DNS names
# derived from the site; the remaining jobs use static targets.
scrape_configs:

# node-exporter
- job_name: linux-servers
  honor_timestamps: true
  metrics_path: /metrics
  scheme: http
  dns_sd_configs:
  - names:
      - "_{{ site }}-prom-linux._tcp.abc.local"
    refresh_interval: 60s

# process-exporter on the two site load balancers (port 9256). The alert
# rules template excludes :9256 instances from its generic offline alert.
- job_name: lb_services
  static_configs:
  - targets:
    - "{{ site }}lb01.abc.local:9256"
    - "{{ site }}lb02.abc.local:9256"

# Prometheus scraping itself.
- job_name: prometheus
  honor_timestamps: true
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - "localhost:9090"

# wmi-exporter / windows-exporter
- job_name: windows-servers
  honor_timestamps: true
  metrics_path: /metrics
  scheme: http
  dns_sd_configs:
  - names:
      - "_{{ site }}-prom-win._tcp.abc.local"
    refresh_interval: 60s

alert-rules.yml.j2


# ANSIBLE MANAGED
# Alerting rule groups. Annotation text is wrapped in Jinja raw blocks so the
# Prometheus template expressions survive Ansible rendering unchanged.
groups:

# Alerts that apply to every scrape target.
- name: General
  rules:
    # Fires when a target has had up == 0 for 5 minutes. Instances on port
    # 9256 are excluded by the matcher.
    - alert: Exporter/Server Offline
      expr: "up{instance!~'.*:9256'} == 0"
      for: 5m
      labels:
        severity: critical
        # Label values are strings, so the flag is quoted rather than left as
        # a YAML boolean.
        slack: "true"
      annotations:
        description: "{% raw %}{{ $labels.instance }} has not been able to be scraped for more than 5 minutes. Check Exporter is running and Server is online.{% endraw %}"
        summary: "{% raw %}{{ $labels.instance }} not available{% endraw %}"

# Alerts specific to Windows hosts.
- name: Windows
  rules:
    # Fires when the splunkforwarder service's "running" state metric has been
    # 0 for 5 minutes on an instance whose name contains "app".
    - alert: WINDOWS SERVICE - The Splunk Forwarder Service on App servers is not RUNNING
      expr: "windows_service_state{instance=~'.*app.*',name='splunkforwarder',state='running'} == 0"
      for: 5m
      labels:
        severity: warning
        slack: "true"
      annotations:
        description: "{% raw %}Splunk Forwarder Service on {{ $labels.instance }} has been down for more than 5 minutes.{% endraw %}"
        summary: "{% raw %}Splunk Forwarder on {{ $labels.instance }} is not running{% endraw %}"

roles/linux_prometheus/handlers/main.yml



---
# Handler: restarts the `alert-manager` container. Tasks trigger it with
# `notify: restart alertmanager`.
- name: restart alertmanager
  docker_container:
    name: alert-manager
    restart: true
    state: started

# Handler: restarts the `prometheus` container. Tasks trigger it with
# `notify: restart prometheus`.
- name: restart prometheus
  docker_container:
    name: prometheus
    restart: true
    state: started

roles/linux_prometheus/files/slack-template.tmpl

 

{{/* custom_slack_title: prints "[" followed by the upper-cased .Status, then
     ":<count>] <emoji> <alertname>". When .Status is "firing" it uses the number
     of firing alerts and :rageface:; when .Status is "resolved" it uses the
     number of resolved alerts and :arnold:. The alert name comes from
     .CommonLabels.alertname. */}}
{{ define "custom_slack_title" }}
  [{{ .Status | toUpper -}}
    {{- if eq .Status "firing" }}:
        {{- .Alerts.Firing | len }}] :rageface: {{ .CommonLabels.alertname }}
    {{ end }}
    {{- if eq .Status "resolved" }}:
        {{- .Alerts.Resolved | len }}] :arnold: {{ .CommonLabels.alertname }}
    {{ end }}
{{ end }}

{{/* custom_slack_message: for each firing alert prints its summary annotation
     (if set), its severity label (if set), its description annotation, and every
     label as a "name: value" bullet in sorted order. Then, for each resolved
     alert that has a summary annotation, prints a "Resolved Alerts" line with
     that summary. */}}
{{ define "custom_slack_message" }}
    {{ range .Alerts.Firing }}
    {{ if .Annotations.summary }}{{ .Annotations.summary }}{{ end }}

    {{ if .Labels.severity }}*Severity* - `{{ .Labels.severity }}`{{ end }}

    *Description:* {{ .Annotations.description }}

    *Details:*
      {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
      {{ end }}
    {{ end }}

    {{ range .Alerts.Resolved }}
    {{ if .Annotations.summary }}*Resolved Alerts* - {{ .Annotations.summary -}}
    {{ end }}
    {{ end}}
{{ end }}

roles/linux_prometheus/tasks/main.yml

 


---
# Create the LVM volume group `vg.prometheus` on /dev/sdb. The result is
# registered as `vg_create` for use by later tasks.
- name: Create a volume group on /dev/sdb for Prometheus
  lvg:
    pvs: /dev/sdb
    vg: vg.prometheus
  register: vg_create

# Allocate all free space in `vg.prometheus` to one logical volume named
# `prometheus`. Runs only when the volume-group task reported a change.
- name: Create a logical volume for Prometheus
  when: vg_create is changed
  lvol:
    lv: prometheus
    vg: vg.prometheus
    size: "100%FREE"

# Format the Prometheus logical volume as ext4.
# This task is intentionally unconditional: the filesystem module leaves a
# device alone when it already holds the requested filesystem, whereas gating
# on the volume-group result would skip formatting forever if an earlier run
# was interrupted after the volume group had been created.
- name: Create a filesystem
  filesystem:
    fstype: ext4
    dev: /dev/vg.prometheus/prometheus

# Ensure the mount point for the Prometheus volume exists.
- name: Create Prometheus directory
  file:
    state: directory
    path: /data/prometheus

# Mount the ext4 logical volume at /data/prometheus (state: mounted).
- name: Mount Prometheus volume
  mount:
    src: /dev/vg.prometheus/prometheus
    path: /data/prometheus
    fstype: ext4
    state: mounted

# Create the per-purpose sub-directories under /data/prometheus, owned by
# uid/gid 1000 (the same user the containers in this role run as).
- name: Create supporting directories
  file:
    state: directory
    path: "/data/prometheus/{{ item }}"
    owner: "1000"
    group: "1000"
  loop:
    - alertmanager
    - configs
    - templates
    - tsdb

# Render prometheus.yml.j2 into the configs directory; a change triggers the
# `restart prometheus` handler.
- name: Copy prometheus.yml config file
  template:
    src: prometheus.yml.j2
    dest: /data/prometheus/configs/prometheus.yml
    owner: "1000"
    group: "1000"
  notify: restart prometheus

# Render alert-rules.yml.j2 into the configs directory; a change triggers the
# `restart prometheus` handler.
- name: Copy alert-rules.yml config file
  template:
    src: alert-rules.yml.j2
    dest: /data/prometheus/configs/alert-rules.yml
    owner: "1000"
    group: "1000"
  notify: restart prometheus

# Render alertmanager.yml.j2 into the configs directory; a change triggers the
# `restart alertmanager` handler.
- name: Copy alertmanager.yml config file
  template:
    src: alertmanager.yml.j2
    dest: /data/prometheus/configs/alertmanager.yml
    owner: "1000"
    group: "1000"
  notify: restart alertmanager

# Copy the notification template files verbatim (copy, not template, so their
# Go-template braces are not rendered by Jinja). A change triggers the
# `restart alertmanager` handler.
- name: Copy template files
  copy:
    src: "{{ item }}"
    dest: "/data/prometheus/templates/{{ item }}"
    owner: "1000"
    group: "1000"
  notify: restart alertmanager
  loop:
    - slack-template.tmpl

# Install the `docker` Python package with pip.
# NOTE(review): presumably a prerequisite for the docker_* modules used by the
# following tasks - confirm.
- name: Install docker python module using pip
  pip:
    name: docker

# Create the Docker network that every container in this role attaches to.
- name: Create prom-net Docker network
  docker_network:
    name: prom-net

# Validate the rendered Prometheus configuration by running
# `promtool check config` in a one-shot container. The container runs in the
# foreground (detach: false) so the task waits for promtool to finish, and
# `cleanup: true` removes the finished container instead of leaving an exited
# `prom-check-config` container behind on every run.
# NOTE(review): the `#{...}` image tag is not Jinja; presumably substituted by
# deployment tooling before Ansible runs - confirm.
- name: Check Prometheus config file
  docker_container:
    name: prom-check-config
    image: prom/prometheus:#{prometheusVersion}
    detach: false
    cleanup: true
    # container_default_behavior: no_defaults
    entrypoint:
      - /bin/promtool
    command: "check config /etc/prometheus/prometheus.yml"
    networks:
      - name: prom-net
    user: "1000:1000"
    volumes:
      - /data/prometheus/configs/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/prometheus/configs/alert-rules.yml:/etc/prometheus/alert-rules.yml

# Validate the rendered Alertmanager configuration by running
# `amtool check-config` in a one-shot container. The container runs in the
# foreground (detach: false) so the task waits for amtool to finish, and
# `cleanup: true` removes the finished container instead of leaving an exited
# `alertmanager-check-config` container behind on every run.
- name: Check Alertmanager config file
  docker_container:
    name: alertmanager-check-config
    image: prom/alertmanager:#{alertManagerVersion}
    detach: false
    cleanup: true
    # container_default_behavior: no_defaults
    entrypoint:
      - /bin/amtool
    command: "check-config /etc/prometheus/alertmanager.yml"
    networks:
      - name: prom-net
    user: "1000:1000"
    volumes:
      - /data/prometheus/configs/alertmanager.yml:/etc/prometheus/alertmanager.yml

# Run the long-lived Prometheus server container as uid/gid 1000 on prom-net,
# publishing port 9090. The config and rule files are bind-mounted into
# /etc/prometheus and the TSDB directory is bind-mounted at /data, matching
# the --storage.tsdb.path flag.
- name: Create a Prometheus Container
  docker_container:
    name: prometheus
    image: prom/prometheus:#{prometheusVersion}
    state: started
    restart_policy: always
    user: "1000:1000"
    # container_default_behavior: no_defaults
    command: "--config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/data --web.enable-lifecycle"
    networks_cli_compatible: true
    networks:
      - name: prom-net
    published_ports:
      - "9090:9090"
    volumes:
      - /data/prometheus/configs/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/prometheus/configs/alert-rules.yml:/etc/prometheus/alert-rules.yml
      - /data/prometheus/tsdb:/data



# Run the long-lived Alertmanager container as uid/gid 1000 on prom-net.
# Port 9093 serves the web UI/API; port 9094 is the cluster port passed to the
# peer via --cluster.peer. Alertmanager clustering uses both TCP and UDP on
# that port, so it is published for both protocols (a bare "9094:9094" only
# publishes TCP).
- name: Create a Alert Manager Container
  docker_container:
    name: alert-manager
    image: prom/alertmanager:#{alertManagerVersion}
    command: "--cluster.peer={{ alert_manager_peer }}:9094 --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager"
    networks:
      - name: prom-net
    networks_cli_compatible: true
    published_ports:
      - "9093:9093"
      - "9094:9094"
      - "9094:9094/udp"
    restart_policy: always
    state: started
    user: "1000:1000"
    volumes:
      - /data/prometheus/alertmanager:/alertmanager
      - /data/prometheus/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /data/prometheus/templates:/etc/alertmanager/templates


    
# Run node-exporter on prom-net, publishing port 9100.
# NOTE(review): no host filesystem mounts, host PID namespace or host
# networking are configured here, so the exporter presumably reports the
# container's view rather than the host's - confirm this is intended.
- name: Create node-exporter Container
  docker_container:
    name: node-exporter
    image: prom/node-exporter:#{nodeExporterVersion}
    state: started
    restart_policy: always
    networks_cli_compatible: true
    networks:
      - name: prom-net
    published_ports:
      - "9100:9100"


# Open the service ports in firewalld, both permanently and in the running
# configuration. The Alertmanager cluster port 9094 is opened for TCP and UDP
# so it matches the other ports this role publishes from its containers.
- name: Ensure firewalld is configured
  firewalld:
    port: "{{ item }}"
    permanent: true
    immediate: true
    state: enabled
  loop:
    - 9090/tcp   # Prometheus
    - 9093/tcp   # Alertmanager web/API
    - 9094/tcp   # Alertmanager cluster
    - 9094/udp   # Alertmanager cluster
    - 9100/tcp   # node-exporter
    # NOTE(review): no container in this role listens on the two ports below;
    # presumably reserved for another component - confirm they are needed.
    - 10901/tcp
    - 19191/tcp

prometheus playbook


---
# Play: apply the linux_prometheus role to every host in the
# `prometheus_servers` inventory group, with privilege escalation.
- name: Install and Config Prometheus
  hosts: prometheus_servers
  become: true

  roles:
    - linux_prometheus