Add K8s specific files

518d2f41 · Alex Ellis · ddc973c3 · 518d2f41 · 518d2f41 · 518d2f41
Commit 518d2f41 authored 7 years ago by Alex Ellis
--- a/prometheus/Dockerfile.alertmanager.k8s
+++ b/prometheus/Dockerfile.alertmanager.k8s
+FROM prom/alertmanager:v0.7.1
+
+COPY k8s.alertmanager.yml /alertmanager.yml
--- a/prometheus/Dockerfile.prometheus.k8s
+++ b/prometheus/Dockerfile.prometheus.k8s
+FROM prom/prometheus:v1.5.2
+
+COPY k8s.prometheus.yml /etc/prometheus/prometheus.yml
+COPY k8s.alert.rules /etc/prometheus/alert.rules
+
--- a/prometheus/k8s.alert.rules
+++ b/prometheus/k8s.alert.rules
+ALERT service_down
+  IF up == 0
+
+ALERT APIHighInvocationRate
+  IF sum ( rate(gateway_function_invocation_total{code="200"}[10s]) ) by (function_name) > 5 
+  FOR 5s
+  LABELS {
+    service = "gateway",
+    severity = "major",
+    value = "{{$value}}"
+  }
+  ANNOTATIONS {
+    summary = "High invocation total on {{ $labels.instance }}",
+    description =  "High invocation total on {{ $labels.instance }}"
+  } 
+
--- a/prometheus/k8s.alertmanager.yml
+++ b/prometheus/k8s.alertmanager.yml
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@example.org'
+  smtp_auth_username: 'alertmanager'
+  smtp_auth_password: 'password'
+  # The auth token for Hipchat.
+  hipchat_auth_token: '1234556789'
+  # Alternative host for Hipchat.
+  hipchat_url: 'https://hipchat.foobar.org/'
+
+# The directory from which notification templates are read.
+templates: 
+- '/etc/alertmanager/template/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This way ensures that you get multiple alerts for the same group that start
+  # firing shortly after another are batched together on the first 
+  # notification.
+  group_wait: 5s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 10s
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend them.
+  repeat_interval: 30s 
+
+  # A default receiver
+  receiver: scale-up
+
+  # All the above attributes are inherited by all child routes and can 
+  # overwritten on each.
+
+  # The child route trees.
+  routes:
+  - match:
+      service: gateway
+      receiver: scale-up
+      severity: major
+
+
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is 
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+receivers:
+- name: 'scale-up'
+  webhook_configs:
+    - url: http://gateway.default:8080/system/alert
+      send_resolved: true
+
--- a/prometheus/k8s.prometheus.yml
+++ b/prometheus/k8s.prometheus.yml
+# my global config
+global:
+  scrape_interval:     15s # By default, scrape targets every 15 seconds.
+  evaluation_interval: 15s # By default, scrape targets every 15 seconds.
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+      monitor: 'faas-monitor'
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+    - 'alert.rules'
+
+
+# A scrape configuration containing exactly one endpoint to scrape:
+# Here it's Prometheus itself.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+
+    # Override the global default and scrape targets from this job every 5 seconds.
+    scrape_interval: 5s
+
+    # metrics_path defaults to '/metrics'
+    # scheme defaults to 'http'.
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: "gateway"
+    scrape_interval: 5s
+    dns_sd_configs:
+      - names: ['gateway.default']
+        port: 8080
+        type: A
+        refresh_interval: 5s
+
+
+