Skip to content

Commit

Permalink
Merge pull request #286 from techeer-sv/BE/#272
Browse files Browse the repository at this point in the history
Be/#272 Prometheus에 Slack 알림 alertmanager 연동
  • Loading branch information
baekhangyeol authored Nov 9, 2023
2 parents ba0a6b8 + 938ad29 commit 2327e10
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 1 deletion.
13 changes: 13 additions & 0 deletions alertmanager/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Alertmanager configuration: every alert is routed to one Slack receiver.
global:
  # SECURITY: a Slack incoming-webhook URL is a credential and must never be
  # committed to version control. It is read from a file at runtime instead.
  # Create ./alertmanager/slack_api_url on the host (containing only the
  # webhook URL) and add it to .gitignore; the compose service mounts
  # ./alertmanager/ at /etc/alertmanager/, which makes it visible here.
  # The webhook that was previously hard-coded in this file is exposed in the
  # repository history and should be revoked and re-issued in Slack.
  slack_api_url_file: '/etc/alertmanager/slack_api_url'

route:
  # Default (and only) receiver for all alerts.
  receiver: 'slack-notifications'
  # Interval after which a notification for a still-firing alert is sent again.
  repeat_interval: 2m

receivers:
  - name: 'slack-notifications'
    slack_configs:
      # The channel name must be quoted: a leading '#' would start a comment.
      - channel: '#_monitoring'
        # Also post a message when the alert stops firing.
        send_resolved: true
        # One line per alert in the group, taken from the rule annotations.
        title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
        text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
14 changes: 14 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ services:
- ./prometheus/data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
links:
- alertmanager:alertmanager
depends_on:
- springboot

Expand All @@ -78,3 +80,15 @@ services:
depends_on:
- prometheus
- springboot

# Alertmanager service: receives alerts and forwards them according to the
# configuration file mounted from ./alertmanager/.
alertmanager:
  # NOTE(review): no image tag is given, so the default tag is pulled;
  # consider pinning a specific version for reproducible deployments.
  image: prom/alertmanager
  container_name: alert-manager
  restart: always
  ports:
    # Quoted so the host:container mapping is always read as a string.
    - '9093:9093'
  volumes:
    # Host config directory mounted at the path referenced by --config.file.
    - ./alertmanager/:/etc/alertmanager/
  command:
    - '--config.file=/etc/alertmanager/config.yml'
    - '--storage.path=/alertmanager'
48 changes: 48 additions & 0 deletions prometheus/alert.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alerting rules. Each rule fires once its expression has held
# continuously for the duration given in `for`.
groups:
  - name: alert.rules
    rules:
      # A scrape target has reported up == 0 for one minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Endpoint {{ $labels.instance }}'
          identifier: '{{ $labels.instance }}'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes.'

      # Available memory has stayed below 10% of total memory for two minutes.
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory (instance {{ $labels.instance }})'
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Major page faults have exceeded 1000 per second for two minutes.
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host memory under memory pressure (instance {{ $labels.instance }})'
          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # A filesystem that is not read-only has had under 10% free space for
      # two minutes.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host out of disk space (instance {{ $labels.instance }})'
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Average non-idle CPU per instance is above 80%; `for: 0m` means the
      # alert fires without a waiting period.
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Host high CPU load (instance {{ $labels.instance }})'
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
4 changes: 3 additions & 1 deletion prometheus/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ global:

alerting:
alertmanagers:
- scheme: http
- static_configs:
- targets:
- targets: ["alertmanager:9093"]

rule_files:
- 'alert.rules'

scrape_configs:
- job_name: "prometheus"
Expand Down

0 comments on commit 2327e10

Please sign in to comment.