diff --git a/prometheus/Dockerfile.alertmanager.k8s b/prometheus/Dockerfile.alertmanager.k8s
new file mode 100644
index 0000000000000000000000000000000000000000..5a2574e8e7803aae20aada1075d3bf7a28ffe031
--- /dev/null
+++ b/prometheus/Dockerfile.alertmanager.k8s
@@ -0,0 +1,3 @@
+FROM prom/alertmanager:v0.7.1
+
+COPY k8s.alertmanager.yml /alertmanager.yml
diff --git a/prometheus/Dockerfile.prometheus.k8s b/prometheus/Dockerfile.prometheus.k8s
new file mode 100644
index 0000000000000000000000000000000000000000..bd86340aa868e72a6111b7c8b1d33bccb8670809
--- /dev/null
+++ b/prometheus/Dockerfile.prometheus.k8s
@@ -0,0 +1,5 @@
+FROM prom/prometheus:v1.5.2
+
+COPY k8s.prometheus.yml /etc/prometheus/prometheus.yml
+COPY k8s.alert.rules /etc/prometheus/alert.rules
+
diff --git a/prometheus/k8s.alert.rules b/prometheus/k8s.alert.rules
new file mode 100644
index 0000000000000000000000000000000000000000..10d5b7cd6548d2b98442ad726210dd2b53879de9
--- /dev/null
+++ b/prometheus/k8s.alert.rules
@@ -0,0 +1,16 @@
+ALERT service_down
+  IF up == 0
+
+ALERT APIHighInvocationRate
+  IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5
+  FOR 5s
+  LABELS {
+    service = "gateway",
+    severity = "major",
+    value = "{{$value}}"
+  }
+  ANNOTATIONS {
+    summary = "High invocation total on {{ $labels.instance }}",
+    description = "High invocation total on {{ $labels.instance }}"
+  }
+
diff --git a/prometheus/k8s.alertmanager.yml b/prometheus/k8s.alertmanager.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d68a0fd55faf0a4c38a764ac498254454b1a898a
--- /dev/null
+++ b/prometheus/k8s.alertmanager.yml
@@ -0,0 +1,69 @@
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@example.org'
+  smtp_auth_username: 'alertmanager'
+  smtp_auth_password: 'password'
+  # The auth token for Hipchat.
+  hipchat_auth_token: '1234556789'
+  # Alternative host for Hipchat.
+  hipchat_url: 'https://hipchat.foobar.org/'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/alertmanager/template/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start
+  # firing shortly after one another are batched together in the first
+  # notification.
+  group_wait: 5s
+
+  # Once the first notification has been sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 10s
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend it.
+  repeat_interval: 30s
+
+  # A default receiver
+  receiver: scale-up
+
+  # All the above attributes are inherited by all child routes and can be
+  # overwritten on each.
+
+  # The child route trees.
+  routes:
+  - match:
+      service: gateway
+      severity: major
+    receiver: scale-up
+
+
+# Inhibition rules allow muting a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+receivers:
+- name: 'scale-up'
+  webhook_configs:
+    - url: http://gateway.default:8080/system/alert
+      send_resolved: true
+
diff --git a/prometheus/k8s.prometheus.yml b/prometheus/k8s.prometheus.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7d182c372f719248f7d3b3225b3a8f7443d497b6
--- /dev/null
+++ b/prometheus/k8s.prometheus.yml
@@ -0,0 +1,40 @@
+# my global config
+global:
+  scrape_interval: 15s     # By default, scrape targets every 15 seconds.
+  evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'faas-monitor'
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+  - 'alert.rules'
+
+
+# A scrape configuration containing exactly one endpoint to scrape:
+# Here it's Prometheus itself.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+
+    # Override the global default and scrape targets from this job every 5 seconds.
+    scrape_interval: 5s
+
+    # metrics_path defaults to '/metrics'
+    # scheme defaults to 'http'.
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: "gateway"
+    scrape_interval: 5s
+    dns_sd_configs:
+      - names: ['gateway.default']
+        port: 8080
+        type: A
+        refresh_interval: 5s
+
+
+