From 4b2b931eb835e7d7503a6cbe9e1c7ba52c9a1d4a Mon Sep 17 00:00:00 2001 From: decentral1se Date: Thu, 17 Mar 2022 10:18:32 +0100 Subject: [PATCH] init --- .gitignore | 1 + README.md | 3 + gathering/.dont-delete-me | 1 + monitoring/.env.sample | 29 + monitoring/README.md | 29 + monitoring/abra.sh | 9 + monitoring/compose.yml | 129 ++ monitoring/grafana-dashboards.yml | 13 + monitoring/grafana-datasources.yml | 10 + monitoring/grafana-stacks-dashboard.json | 745 +++++++++ monitoring/grafana-swarm-dashboard.json | 1729 +++++++++++++++++++++ monitoring/grafana-traefik-dashboard.json | 1536 ++++++++++++++++++ monitoring/loki.yml.tmpl | 71 + monitoring/node-exporter-entrypoint.sh | 9 + monitoring/prometheus.yml.tmpl | 58 + monitoring/promtail.yml | 18 + 16 files changed, 4390 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 gathering/.dont-delete-me create mode 100644 monitoring/.env.sample create mode 100644 monitoring/README.md create mode 100644 monitoring/abra.sh create mode 100644 monitoring/compose.yml create mode 100644 monitoring/grafana-dashboards.yml create mode 100644 monitoring/grafana-datasources.yml create mode 100644 monitoring/grafana-stacks-dashboard.json create mode 100644 monitoring/grafana-swarm-dashboard.json create mode 100644 monitoring/grafana-traefik-dashboard.json create mode 100644 monitoring/loki.yml.tmpl create mode 100644 monitoring/node-exporter-entrypoint.sh create mode 100644 monitoring/prometheus.yml.tmpl create mode 100644 monitoring/promtail.yml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..37b52cc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.envrc diff --git a/README.md b/README.md new file mode 100644 index 0000000..860074b --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# new-monitoring-experiment + +WIP. diff --git a/gathering/.dont-delete-me b/gathering/.dont-delete-me new file mode 100644 index 0000000..45b983b --- /dev/null +++ b/gathering/.dont-delete-me @@ -0,0 +1 @@ +hi diff --git a/monitoring/.env.sample b/monitoring/.env.sample new file mode 100644 index 0000000..3822cbc --- /dev/null +++ b/monitoring/.env.sample @@ -0,0 +1,29 @@ +TYPE=monitoring + +GRAFANA_DOMAIN=grafana.example.com +PROMETHEUS_DOMAIN=prometheus.example.com + +LETS_ENCRYPT_ENV=production + +# Edit this in order to allow collection of traefik metrics +#TRAEFIK_METRICS_ENABLED=1 +#TRAEFIK_SERVICE_NAME=traefik_app + +# grafana SMTP configuration (optional) +#GF_SMTP_HOST=changeme +#GF_SMTP_ENABLED=1 +#GF_SMTP_FROM_ADDRESS=grafana@example.com +#GF_SMTP_SKIP_VERIFY=1 + +# Additional grafana settings (unlikely to require editing) +GF_SECURITY_ALLOW_EMBEDDING=1 +GF_INSTALL_PLUGINS=grafana-piechart-panel +GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN} + +# Loki stores logs in object storage, fill these up with your +# minio configuration (or any s3-compatible object store) +#LOKI_AWS_ENDPOINT=https://minio/ +#LOKI_AWS_REGION=eu-west-1 +#LOKI_ACCESS_KEY_ID= +#LOKI_SECRET_ACCESS_KEY= +#LOKI_BUCKET_NAMES=loki diff --git a/monitoring/README.md b/monitoring/README.md new file mode 100644 index 0000000..4d74a62 --- /dev/null +++ b/monitoring/README.md @@ -0,0 +1,29 @@ +# Monitoring + +A server and application monitoring stack based on Prometheus, Loki and Grafana. + + +* **Category**: Utilities +* **Status**: 2, beta +* **Images**: [`prom/prometheus`](https://hub.docker.com/r/prom/prometheus) [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana) [`grafana/loki`](https://hub.docker.com/r/grafana/loki), upstream +* **Healthcheck**: 3 +* **Backups**: No +* **Email**: 3 +* **Tests**: No +* **SSO**: 1 + + +## Basic usage + +1. Set up Docker Swarm and [`abra`] +2. Deploy [`coop-cloud/traefik`] +3. Deploy [`coop-cloud/minio`] +4. `abra app new monitoring --secrets` (optionally with `--pass` if you'd like + to save secrets in `pass`) +5. `abra app YOURAPPDOMAIN config` - be sure to change `GRAFANA_DOMAIN` and `PROMETHEUS_DOMAIN` to something that resolves to + your Docker swarm box +6. `abra app YOURAPPDOMAIN deploy` +7. Open the configured domain in your browser to finish set-up + +[`abra`]: https://git.autonomic.zone/autonomic-cooperative/abra +[`coop-cloud/traefik`]: https://git.autonomic.zone/coop-cloud/traefik diff --git a/monitoring/abra.sh b/monitoring/abra.sh new file mode 100644 index 0000000..74bf30a --- /dev/null +++ b/monitoring/abra.sh @@ -0,0 +1,9 @@ +export PROMETHEUS_YML_VERSION=v2 +export PROMTAIL_YML_VERSION=v1 +export LOKI_YML_VERSION=v1 +export NODE_EXPORTER_ENTRYPOINT_VERSION=v1 +export GRAFANA_DATASOURCES_YML_VERSION=v1 +export GRAFANA_DASHBOARDS_YML_VERSION=v1 +export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1 +export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1 +export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1 diff --git a/monitoring/compose.yml b/monitoring/compose.yml new file mode 100644 index 0000000..cb2faa0 --- /dev/null +++ b/monitoring/compose.yml @@ -0,0 +1,129 @@ +--- +version: "3.8" + +services: + app: + image: grafana/grafana + volumes: + - grafana-data:/var/lib/grafana:rw + configs: + - source: grafana_datasources_yml + target: /etc/grafana/provisioning/datasources/datasources.yml + - source: grafana_dashboards_yml + target: /etc/grafana/provisioning/dashboards/dashboards.yml + - source: grafana_swarm_dashboard_json + target: /var/lib/grafana/dashboards/docker-swarm-nodes.json + - source: grafana_stacks_dashboard_json + target: /var/lib/grafana/dashboards/docker-swarm-stacks.json + - source: grafana_traefik_dashboard_json + target: /var/lib/grafana/dashboards/traefik.json + networks: + - proxy + environment: + - GF_SMTP_HOST + - GF_SMTP_ENABLED + - GF_SMTP_FROM_ADDRESS + - GF_SMTP_SKIP_VERIFY + - GF_SECURITY_ALLOW_EMBEDDING + - GF_INSTALL_PLUGINS=grafana-piechart-panel + - GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN} + deploy: + labels: + - "traefik.enable=true" + - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000" + - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure" + - "traefik.http.routers.${STACK_NAME}-grafana.tls=true" + - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}" + healthcheck: + test: "wget -q http://localhost:3000/ -O/dev/null" + interval: 5s + timeout: 10s + retries: 3 + start_period: 10s + + loki: + image: grafana/loki:2.0.0 + command: -config.file=/etc/loki/local-config.yaml + networks: + - loki + - api + deploy: + endpoint_mode: dnsrr + ports: + - target: 3100 + published: 3100 + protocol: tcp + mode: host + configs: + - source: loki_yml + target: /etc/loki/local-config.yaml + volumes: + - loki-data:/loki + + prometheus: + image: prom/prometheus:latest + volumes: + - prometheus-data:/prometheus:rw + configs: + - source: prometheus_yml + target: /etc/prometheus/prometheus.yml + networks: + api: + aliases: + - prometheus_api + exporters: ~ + proxy: ~ + deploy: + restart_policy: + condition: on-failure + labels: + - "traefik.enable=true" + - "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090" + - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure" + - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true" + - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}" + healthcheck: + test: "wget -q http://localhost:9090/graph -O/dev/null" + interval: 5s + timeout: 10s + retries: 3 + start_period: 10s + +configs: + prometheus_yml: + template_driver: golang + name: ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION} + file: prometheus.yml.tmpl + promtail_yml: + name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION} + file: promtail.yml + loki_yml: + template_driver: golang + name: ${STACK_NAME}_loki_yml_${LOKI_YML_VERSION} + file: loki.yml.tmpl + grafana_datasources_yml: + name: ${STACK_NAME}_grafana_datasources_yml_${GRAFANA_DATASOURCES_YML_VERSION} + file: grafana-datasources.yml + grafana_dashboards_yml: + name: ${STACK_NAME}_grafana_dashboards_yml_${GRAFANA_DASHBOARDS_YML_VERSION} + file: grafana-dashboards.yml + grafana_swarm_dashboard_json: + name: ${STACK_NAME}_grafana_swarm_dashboard_json_${GRAFANA_SWARM_DASHBOARD_JSON_VERSION} + file: grafana-swarm-dashboard.json + grafana_stacks_dashboard_json: + name: ${STACK_NAME}_grafana_stacks_dashboard_json_${GRAFANA_STACKS_DASHBOARD_JSON_VERSION} + file: grafana-stacks-dashboard.json + grafana_traefik_dashboard_json: + name: ${STACK_NAME}_grafana_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION} + file: grafana-traefik-dashboard.json + +volumes: + prometheus-data: + grafana-data: + loki-data: + +networks: + proxy: + external: true diff --git a/monitoring/grafana-dashboards.yml b/monitoring/grafana-dashboards.yml new file mode 100644 index 0000000..8411cca --- /dev/null +++ b/monitoring/grafana-dashboards.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'default-dashboard-provider' + orgId: 1 + folder: 'default-dashboards' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true diff --git a/monitoring/grafana-datasources.yml b/monitoring/grafana-datasources.yml new file mode 100644 index 0000000..a6361fc --- /dev/null +++ b/monitoring/grafana-datasources.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/monitoring/grafana-stacks-dashboard.json b/monitoring/grafana-stacks-dashboard.json new file mode 100644 index 0000000..a3322a2 --- /dev/null +++ b/monitoring/grafana-stacks-dashboard.json @@ -0,0 +1,745 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Simple docker swarm monitoring with relation to individual stacks.", + "editable": true, + "gnetId": 7007, + "graphTooltip": 1, + "id": 2, + "iteration": 1626744694222, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 10, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "(time() - min(container_start_time_seconds{container_label_com_docker_stack_namespace=~\"$stack\"}))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Stack Uptime", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": null, + "fill": 5, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 20, + "x": 4, + "y": 0 + }, + "height": "270px", + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval])) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage per Container", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 3 + }, + "id": 3, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "count(rate(container_last_seen{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "refId": "B", + "step": 4 + } + ], + "title": "Containers", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 7 + }, + "height": "270px", + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_rss{container_label_com_docker_stack_namespace=~\"$stack\"}) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}", + "metric": "container_memory_rss", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage per Container", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "height": "270px", + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval])) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Incoming Network Traffic per Container", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 21 + }, + "height": "270px", + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval])) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Outgoing Network Traffic per Container", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [ + "prometheus", + "cAdvisor", + "node-exporter", + "alertmanager" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "pretix", + "value": "pretix" + }, + "datasource": "Prometheus", + "definition": "", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "Stack", + "multi": false, + "name": "stack", + "options": [], + "query": { + "query": "query_result(count(container_last_seen{container_label_com_docker_stack_namespace =~\".+\"}) by (container_label_com_docker_stack_namespace))", + "refId": "Prometheus-stack-Variable-Query" + }, + "refresh": 1, + "regex": "/container_label_com_docker_stack_namespace=\"(.*)\"/", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 50, + "auto_min": "50s", + "current": { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + "description": null, + "error": null, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "7m", + "value": "7m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "30s,1m,2m,3m,5m,7m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Swarm Stack Monitoring", + "uid": "KdVoGQm7z", + "version": 1 +} diff --git a/monitoring/grafana-swarm-dashboard.json b/monitoring/grafana-swarm-dashboard.json new file mode 100644 index 0000000..6c9468a --- /dev/null +++ b/monitoring/grafana-swarm-dashboard.json @@ -0,0 +1,1729 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker Swarm nodes metrics", + "editable": true, + "gnetId": 7461, + "graphTooltip": 0, + "id": 1, + "iteration": 1626741910840, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "green", + "value": 86400 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "hideTimeOverride": true, + "id": 2, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "topk(1, sum((node_time_seconds - node_boot_time_seconds) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) by (node_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 2 + } + ], + "timeFrom": "1m", + "timeShift": null, + "title": "Uptime", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 1, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "count(node_meta * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "title": "Nodes", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 8 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "hideTimeOverride": true, + "id": 4, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "count(node_cpu_seconds_total{mode=\"idle\"} * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 2 + } + ], + "timeFrom": "1m", + "timeShift": null, + "title": "CPUs", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "hideTimeOverride": true, + "id": 11, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) * 100 / count(node_cpu_seconds_total{mode=\"user\"} * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 2 + } + ], + "timeFrom": "1m", + "timeShift": null, + "title": "CPU Idle", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load5 * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "load5 {{node_name}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "System Load by Node", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 4 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"} * 100) by (node_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node_name}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage by Node", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 11 + }, + "hideTimeOverride": true, + "id": 3, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Memory", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 8, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "sum((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "title": "Available Memory", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 11 + }, + "hideTimeOverride": true, + "id": 9, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "exemplar": true, + "expr": "sum(node_filesystem_size_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"} * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Disk Space", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 11 + }, + "id": 10, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "exemplar": true, + "expr": "sum((node_filesystem_free_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"} / node_filesystem_size_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"}) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"} * 100) / count(node_meta * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "title": "Available Disk Space", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Cached_bytes - node_memory_Buffers_bytes - node_memory_Slab_bytes) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) by (node_name)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Used {{node_name}}", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(node_memory_Cached * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Cached {{node_name}}", + "refId": "B", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory usage by Node", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(node_disk_read_bytes_total[$interval]) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) by (node_name)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Read {{node_name}}", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(irate(node_disk_written_bytes_total[$interval]) * on(instance) group_left(node_name) node_meta{node_id=~\"$node_id\"}) by (node_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Written {{node_name}}", + "refId": "B", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk I/O by Node", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(node_disk_reads_completed_total[$interval]) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) by (node_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads {{node_name}}", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(irate(node_disk_writes_completed_total[$interval]) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"}) by (node_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes {{node_name}}", + "refId": "B", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "IOPS by Node", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(avg(irate(node_cpu_seconds_total{mode=\"iowait\"}[$interval]) * on(instance) group_left(node_name) node_meta{node_name=~\"$node_id\"} * 100) by (node_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node_name}}", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU IO Wait by Node", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 0, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 18, + "x": 0, + "y": 36 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_last_seen[5m]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta{node_name=~\"${node_id}\"}) by (container_label_com_docker_swarm_service_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{ container_label_com_docker_swarm_service_name }}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Running Containers by Service", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 36 + }, + "id": 7, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.6", + "targets": [ + { + "expr": "count(rate(container_last_seen[5m]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta{node_name=~\"$node_id\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 20 + } + ], + "title": "Total Containers", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 43 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total[$interval]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta{node_name=~\"${node_id}\"}) by (node_name)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "IN {{node_name}}", + "refId": "A", + "step": 2 + }, + { + "expr": "- sum(rate(container_network_transmit_bytes_total[$interval]) * on(container_label_com_docker_swarm_node_id) group_left(node_name) node_meta{node_name=~\"${node_id}\"}) by (node_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "OUT {{node_name}}", + "metric": "", + "refId": "B", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Containers Network Traffic by Node", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [], + "datasource": "Prometheus", + "fontSize": "100%", + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 50 + }, + "hideTimeOverride": true, + "id": 20, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(node_meta) by (node_id, node_name, instance)", + "format": "table", + "intervalFactor": 2, + "refId": "A", + "step": 2 + } + ], + "timeFrom": "1s", + "title": "Cluster members", + "transform": "table", + "type": "table-old" + } + ], + "refresh": "5s", + "schemaVersion": 30, + "style": "dark", + "tags": [ + "swarmprom", + "prometheus", + "node-exporter", + "cadvisor" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "Prometheus", + "definition": "", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "Swarm Node", + "multi": false, + "name": "node_id", + "options": [], + "query": { + "query": "node_meta", + "refId": "Prometheus-node_id-Variable-Query" + }, + "refresh": 1, + "regex": "/node_name=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "auto": true, + "auto_count": 30, + "auto_min": "30s", + "current": { + "selected": false, + "text": "1m", + "value": "1m" + }, + "description": null, + "error": null, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Docker Swarm Nodes", + "uid": "BPlb-Sgik", + "version": 4 +} diff --git a/monitoring/grafana-traefik-dashboard.json b/monitoring/grafana-traefik-dashboard.json new file mode 100644 index 0000000..78b6c14 --- /dev/null +++ b/monitoring/grafana-traefik-dashboard.json @@ -0,0 +1,1536 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Simple dashboard for Traefik 2", + "editable": true, + "gnetId": 11462, + "graphTooltip": 0, + "id": 4, + "iteration": 1628548257934, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "green", + "value": 86400 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 22, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.3.5", + "targets": [ + { + "expr": "time() - process_start_time_seconds{job=\"$job\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#37872D", + "value": null + }, + { + "color": "#C4162A", + "value": 1 + }, + { + "color": "#C4162A", + "value": 5 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 3, + "y": 0 + }, + "id": 32, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.3.5", + "targets": [ + { + "expr": "sum(increase(traefik_service_requests_total{code=\"499\"}[$interval]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Max Service Error count ($interval)", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#37872D", + "value": null + }, + { + "color": "#37872D", + "value": 0 + }, + { + "color": "rgb(156, 156, 156)", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 7, + "y": 0 + }, + "id": 26, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "count" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.3.5", + "targets": [ + { + "expr": "sum(increase(traefik_service_requests_total{code=\"404\",protocol=~\"$protocol\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "traefik_requests_total", + "refId": "A", + "step": 60 + } + ], + "title": "Max 404 Error Count ($interval)", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#C4162A", + "value": null + }, + { + "color": "#FF9830", + "value": 10 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 10, + "y": 0 + }, + "id": 33, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.3.5", + "targets": [ + { + "expr": "sum(increase(traefik_service_requests_total{code=\"200\",protocol=~\"$protocol\"}[$interval]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "traefik_requests_total", + "refId": "A", + "step": 60 + } + ], + "title": "200 Status Count (last $interval)", + "type": "stat" + }, + { + "aliasColors": {}, + "breakPoint": "50%", + "cacheTimeout": null, + "combine": { + "label": "Others", + "threshold": 0 + }, + "datasource": "Prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "80%", + "format": "short", + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 0 + }, + "id": 18, + "interval": null, + "legend": { + "percentage": true, + "show": true, + "sort": null, + "sortDesc": null, + "values": true + }, + "legendType": "Right side", + "links": [], + "maxDataPoints": 3, + "nullPointMode": "connected", + "pieType": "pie", + "strokeWidth": 1, + "targets": [ + { + "expr": "topk(5, sum(traefik_service_requests_total{protocol=~\"$protocol\"}) by (code))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}}", + "refId": "A" + } + ], + "title": "Top 5 $protocol return codes", + "type": "grafana-piechart-panel", + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 10, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(traefik_service_requests_total{code=\"499\",method=\"GET\"}[$interval])) by (service)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{service}} ", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bad Status Code Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [], + "unit": "ms" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 10, + "x": 10, + "y": 6 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(traefik_entrypoint_request_duration_seconds_sum{protocol=~\"$protocol\"}) / sum(traefik_entrypoint_requests_total{protocol=~\"$protocol\"}) * 1000) - (sum(traefik_service_request_duration_seconds_sum{protocol=~\"$protocol\"}) / sum(traefik_service_requests_total{protocol=~\"$protocol\"}) * 1000)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Average processing time (ms)", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average processing time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 10, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 10 + }, + { + "color": "#EAB839", + "value": 200 + }, + { + "color": "red", + "value": 500 + }, + { + "color": "purple", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 6 + }, + "id": 20, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.3.5", + "targets": [ + { + "expr": "sum(traefik_entrypoint_request_duration_seconds_sum) / sum(traefik_entrypoint_requests_total) * 1000", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Average response time", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 17, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "hideTimeOverride": false, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(delta(traefik_service_requests_total[$interval]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Total requests", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 7, + "x": 17, + "y": 12 + }, + "hideTimeOverride": false, + "id": 34, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "7.3.5", + "targets": [ + { + "expr": "sum(rate(traefik_service_requests_total[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total requests", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests per second", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 10, + "x": 0, + "y": 18 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "process_open_fds{job=~\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Used sockets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 14, + "x": 10, + "y": 18 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_service_requests_total{protocol=~\"http|https\"}[$interval])) by (service)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{service}} ", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Access to services", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 7, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_entrypoint_open_connections) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ENTRYPOINT - Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 7, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_service_open_connections) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SERVICE - Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 31 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/^[^234].*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(traefik_service_requests_total{protocol=~\"$protocol\"}[$interval])) by (code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}}", + "refId": "A", + "step": 120 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Status Code Count ", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "traefik", + "load-balancer", + "docker", + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "traefik", + "value": "traefik" + }, + "datasource": "Prometheus", + "definition": "", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Job:", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "Prometheus", + "definition": "label_values(traefik_service_requests_total, protocol)", + "error": null, + "hide": 0, + "includeAll": true, + "label": "Service:", + "multi": true, + "name": "protocol", + "options": [], + "query": "label_values(traefik_service_requests_total, protocol)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + "error": null, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Traefik 2", + "uid": "3ipsWfViz", + "version": 4 +} diff --git a/monitoring/loki.yml.tmpl b/monitoring/loki.yml.tmpl new file mode 100644 index 0000000..a899331 --- /dev/null +++ b/monitoring/loki.yml.tmpl @@ -0,0 +1,71 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +distributor: + ring: + kvstore: + store: memberlist + +ingester: + lifecycler: + ring: + kvstore: + store: memberlist + replication_factor: 1 + final_sleep: 0s + chunk_idle_period: 5m + chunk_retain_period: 30s + +memberlist: + abort_if_cluster_join_fails: false + + # Expose this port on all distributor, ingester + # and querier replicas. + bind_port: 7946 + + # You can use a headless k8s service for all distributor, + # ingester and querier components. + join_members: + - loki:7946 + + max_join_backoff: 1m + max_join_retries: 10 + min_join_backoff: 1s + +schema_config: + configs: + - from: 2020-11-25 + store: boltdb-shipper + object_store: aws + schema: v11 + index: + prefix: index_ + period: 24h + +storage_config: + boltdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + resync_interval: 5s + shared_store: aws + + aws: + endpoint: {{ env LOKI_AWS_ENDPOINT }} + region: {{ env LOKI_AWS_REGION }} + access_key_id: {{ env LOKI_ACCESS_KEY_ID }} + secret_access_key: {{ env LOKI_SECRET_ACCESS_KEY }} + bucketnames: {{ env_LOKI_BUCKET_NAMES }} + insecure: false + sse_encryption: false + http_config: + idle_conn_timeout: 90s + response_header_timeout: 0s + insecure_skip_verify: false + s3forcepathstyle: true + +limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h diff --git a/monitoring/node-exporter-entrypoint.sh b/monitoring/node-exporter-entrypoint.sh new file mode 100644 index 0000000..72e2f0b --- /dev/null +++ b/monitoring/node-exporter-entrypoint.sh @@ -0,0 +1,9 @@ +#!/bin/sh -e + +NODE_NAME=$(cat /etc/nodename) +mkdir -p /etc/node-exporter +echo "node_meta{node_id=\"$NODE_ID\", container_label_com_docker_swarm_node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" > /etc/node-exporter/node-meta.prom + +set -- /bin/node_exporter "$@" + +exec "$@" diff --git a/monitoring/prometheus.yml.tmpl b/monitoring/prometheus.yml.tmpl new file mode 100644 index 0000000..2547b27 --- /dev/null +++ b/monitoring/prometheus.yml.tmpl @@ -0,0 +1,58 @@ +global: + scrape_interval: 30s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 30s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + static_configs: + - targets: + - localhost:9090 + + # http://node_exporter:9100/metrics + - job_name: node-exporter + scrape_interval: 10s + metrics_path: "/metrics" + dns_sd_configs: + - names: + - 'tasks.node_exporter' + type: 'A' + port: 9100 + + + - job_name: 'cadvisor' + scrape_interval: 30s + metrics_path: '/metrics' + dns_sd_configs: + - names: + - 'tasks.cadvisor' + type: 'A' + port: 8080 + +{{ if eq (env "TRAEFIK_METRICS_ENABLED") "1" }} + - job_name: 'traefik' + scrape_interval: 30s + metrics_path: '/metrics' + dns_sd_configs: + - names: + - 'tasks.{{ (env "TRAEFIK_SERVICE_NAME") }}' + type: 'A' + port: 8082 +{{ end }} diff --git a/monitoring/promtail.yml b/monitoring/promtail.yml new file mode 100644 index 0000000..ed06e8c --- /dev/null +++ b/monitoring/promtail.yml @@ -0,0 +1,18 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: +- job_name: system + static_configs: + - targets: + - localhost + labels: + job: varlogs + __path__: /var/log/*log