Compare commits

...

21 Commits

Author SHA1 Message Date
23788856fe attempt to trim config 2022-03-31 14:03:20 +02:00
912daa0f2a mail mail mail 2022-03-31 13:04:09 +02:00
769fc6861a make loki work 2022-03-30 13:05:19 +02:00
7e7e7b623f setup root route 2022-03-30 12:33:43 +02:00
e6adc17974 allow grafana experimental alert manager 2022-03-30 12:14:53 +02:00
c77da9de1d hacking, things work 2022-03-18 16:19:19 +01:00
e7dd69f38b hackity hack hack loki now working 2022-03-18 14:50:28 +01:00
5ef8936388 move into root 2022-03-18 12:53:20 +01:00
98b5dc44bf point to gathering 2022-03-18 10:28:11 +01:00
4ae2a59737 loki auth 2022-03-18 10:20:27 +01:00
77f93471fa more secrets, configs & alertmanager 2022-03-17 22:11:51 +01:00
0a77e4d00f drop healthcheck, we have basic auth now 2022-03-17 21:41:02 +01:00
9612d666aa nearly there with auth 2022-03-17 16:50:34 +01:00
ae8aad0b38 add password generator 2022-03-17 13:05:43 +01:00
b3cc291d39 admin password for grafana 2022-03-17 13:05:09 +01:00
b8535eaea9 add links 2022-03-17 12:56:57 +01:00
23ca5c1948 thread env 2022-03-17 12:54:47 +01:00
5346f50409 fix env/secret loading 2022-03-17 12:49:29 +01:00
0416d34fda fix aws secret access key version name 2022-03-17 12:46:03 +01:00
257e285932 more specific instructions for monitoring 2022-03-17 12:44:09 +01:00
e5463521dc drop that config 2022-03-17 12:41:20 +01:00
21 changed files with 429 additions and 4276 deletions

View File

@ -1,13 +1,34 @@
# new-monitoring-experiment ## monitoring
https://pad.autonomic.zone/lyCDVQbaSpusOtIZXHzHMA?view#2022-03-17-monitoring-co-hack > grafana/loki/prometheus
## hacking - [g.monitor.autonomic.zone](https://g.monitor.autonomic.zone)
- [p.monitor.autonomic.zone](https://p.monitor.autonomic.zone)
- [l.monitor.autonomic.zone](https://l.monitor.autonomic.zone)
We're deploying stuff by hand for now while we hack away on experimental things. ## setup
``` ```
set -a && source .env.sample && set +a printf $(pass show hosts/swarm.autonomic.zone/minio/secret_key) | docker secret create gp_monitoring_loki_aws_secret_access_key_v1 -
docker context use $yourremotecontext printf password | docker secret create gp_monitoring_grafana_admin_password_v1 -
docker stack deploy -c compose.yml $yourstackname printf <...> | docker secret create gp_monitoring_grafana_oauth_client_secret_v1 -
pwgen -s 64 1; ./scripts/genpw.py # input password & get hashed output for secret
printf <...> | docker secret create gp_monitoring_prometheus_admin_password_v1 -
pwgen -s 64 1; ./scripts/genpw.py # input password & get hashed output for secret
printf <...> | docker secret create gp_monitoring_loki_admin_password_v1 -
printf <...> | docker secret create gp_monitoring_alertmanager_smtp_password_v1 -
printf <...> | docker secret create gp_monitoring_grafana_smtp_password_v1 -
set -a && source env && set +a
docker context use monitor.autonomic.zone
docker stack deploy -c compose.yml gp_monitoring
``` ```
## questions / TODO
- [ ] how to load in secrets from multiple hosts & keep a recipe generic?
- [ ] basic auth on gathering stack

13
alertmanager.yml.tmpl Normal file
View File

@ -0,0 +1,13 @@
# Alertmanager configuration template.
#
# compose.yml registers this file as a Docker config with
# `template_driver: golang`, so the env/secret actions below are expanded
# when the config is mounted into the alertmanager service.
global:
  # Every templated value is single-quoted so that an empty expansion, or a
  # value containing YAML-special characters (leading `@`, `!`, `*`, a ` #`
  # or `: ` sequence, ...), is still parsed as a plain string.
  # NOTE: a value that itself contains a single quote would still break the
  # rendered file; avoid `'` in these settings and in the SMTP password.
  smtp_from: '{{ env "ALERTMANAGER_SMTP_FROM" }}'
  smtp_smarthost: '{{ env "ALERTMANAGER_SMTP_HOST" }}'
  # The sender address is also used as the SMTP login name.
  smtp_auth_username: '{{ env "ALERTMANAGER_SMTP_FROM" }}'
  smtp_auth_password: '{{ secret "alertmanager_smtp_password" }}'
route:
  # Root route: with no child routes defined, all alerts go to this receiver.
  receiver: "kaboom-mailer"
receivers:
  - name: "kaboom-mailer"
    email_configs:
      - to: '{{ env "ALERTMANAGER_SMTP_TO" }}'

205
compose.yml Normal file
View File

@ -0,0 +1,205 @@
---
# Monitoring stack for Docker Swarm: Grafana (app), Prometheus, Alertmanager
# and Loki, with an nginx front (web) that puts basic auth in front of Loki.
# Public services are exposed through Traefik labels on the external "proxy"
# network; everything else talks over the stack-local "internal" network.
version: "3.8"

services:
  # Grafana UI.
  app:
    image: grafana/grafana:8.4.4
    volumes:
      - grafana-data:/var/lib/grafana:rw
    secrets:
      - grafana_admin_password
      - grafana_oauth_client_secret
    configs:
      - source: grafana_custom_ini
        target: /etc/grafana/grafana.ini
    networks:
      - proxy
      - internal
    environment:
      # Bare names are passed through from the environment of the shell
      # running `docker stack deploy`.
      - GF_SMTP_HOST
      - GF_SMTP_ENABLED
      - GF_SMTP_FROM_ADDRESS
      - GF_SMTP_SKIP_VERIFY
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      - GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
      # Admin password is read from the mounted Docker secret.
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
      # Consumed by the [auth.generic_oauth] section of grafana_custom.ini.
      - KEYCLOAK_API_URL
      - KEYCLOAK_AUTH_URL
      - KEYCLOAK_TOKEN_URL
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
        - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
        - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
    healthcheck:
      test: "wget -q http://localhost:3000/ -O/dev/null"
      interval: 5s
      timeout: 10s
      retries: 3
      start_period: 10s

  # Prometheus; its web endpoint is protected by the basic-auth settings in
  # prometheus_web.yml (passed via --web.config.file).
  prometheus:
    image: prom/prometheus:v2.34.0
    secrets:
      - prometheus_admin_password
      - prometheus_admin_password_hashed
      - swarm_demo_admin_password
    volumes:
      - prometheus-data:/prometheus:rw
    configs:
      - source: prometheus_yml
        target: /etc/prometheus/prometheus.yml
      - source: prometheus_web_yml
        target: /etc/prometheus/prometheus_web.yml
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--web.config.file=/etc/prometheus/prometheus_web.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
    networks:
      - proxy
      - internal
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        # Service name uses the same "<stack>-<service>" form as the routers
        # below and as the other services in this file.
        - "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
        - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
        - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"

  # Alertmanager; only reachable on the internal network.
  alertmanager:
    image: prom/alertmanager:v0.23.0
    volumes:
      # Mounted at the directory given to --storage.path below so that
      # Alertmanager's state lives in the named volume. The configuration
      # itself comes from the Docker config mounted under /etc/alertmanager.
      - alertmanager-data:/alertmanager
    command:
      - "--config.file=/etc/alertmanager/config.yml"
      - "--storage.path=/alertmanager"
    networks:
      - internal
    secrets:
      - alertmanager_smtp_password
    configs:
      - source: alertmanager_config
        target: /etc/alertmanager/config.yml
    environment:
      # Read by the env actions in alertmanager.yml.tmpl.
      - ALERTMANAGER_SMTP_FROM
      - ALERTMANAGER_SMTP_HOST
      - ALERTMANAGER_SMTP_TO

  # nginx reverse proxy in front of Loki (see nginx.conf.tmpl).
  web:
    image: nginx:1.20.0
    networks:
      - proxy
      - internal
    environment:
      # Read by the env actions in nginx.conf.tmpl.
      - LOKI_DOMAIN
      - STACK_NAME
    configs:
      - source: nginx_config
        target: /etc/nginx/nginx.conf
      - source: htpasswd_conf
        target: /etc/nginx/conf.d/loki.htpasswd
    secrets:
      # Rendered into the htpasswd file by loki.htpasswd.tmpl.
      - loki_admin_password_hashed
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-web.loadbalancer.server.port=80"
        - "traefik.http.routers.${STACK_NAME}-web.rule=Host(`${LOKI_DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-web.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-web.tls.certresolver=${LETS_ENCRYPT_ENV}"

  # Loki; not exposed directly, only reachable through the web service.
  loki:
    image: grafana/loki:2.0.0
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - internal
    configs:
      - source: loki_yml
        target: /etc/loki/local-config.yaml
    volumes:
      - loki-data:/loki
    secrets:
      - loki_aws_secret_access_key
    environment:
      # Read by the env actions in loki.yml.tmpl.
      - LOKI_ACCESS_KEY_ID
      - LOKI_AWS_ENDPOINT
      - LOKI_AWS_REGION
      - LOKI_BUCKET_NAMES
      - STACK_NAME

# Docker configs. Each name carries a *_VERSION suffix taken from the
# environment; all of them are rendered with the golang template driver.
configs:
  grafana_custom_ini:
    template_driver: golang
    name: ${STACK_NAME}_grafana_custom_ini_${GRAFANA_CUSTOM_INI_VERSION}
    file: grafana_custom.ini
  prometheus_yml:
    template_driver: golang
    name: ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION}
    file: prometheus.yml.tmpl
  prometheus_web_yml:
    template_driver: golang
    name: ${STACK_NAME}_prometheus_web_yml_${PROMETHEUS_WEB_YML_VERSION}
    file: prometheus_web.yml.tmpl
  loki_yml:
    template_driver: golang
    name: ${STACK_NAME}_loki_yml_${LOKI_YML_VERSION}
    file: loki.yml.tmpl
  alertmanager_config:
    template_driver: golang
    name: ${STACK_NAME}_alertmanager_config_${ALERTMANAGER_CONFIG_VERSION}
    file: ./alertmanager.yml.tmpl
  nginx_config:
    template_driver: golang
    name: ${STACK_NAME}_nginx_config_${NGINX_CONFIG_VERSION}
    file: nginx.conf.tmpl
  htpasswd_conf:
    template_driver: golang
    name: ${STACK_NAME}_htpasswd_${HTPASSWD_CONFIG_VERSION}
    file: loki.htpasswd.tmpl

volumes:
  prometheus-data:
  grafana-data:
  loki-data:
  alertmanager-data:

networks:
  # Pre-existing network shared with Traefik.
  proxy:
    external: true
  internal:

# All secrets are created out of band (`docker secret create ...`) and are
# looked up by their versioned name.
secrets:
  loki_aws_secret_access_key:
    external: true
    name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
  grafana_admin_password:
    external: true
    name: ${STACK_NAME}_grafana_admin_password_${SECRET_GRAFANA_ADMIN_PASSWORD_VERSION}
  grafana_oauth_client_secret:
    external: true
    name: ${STACK_NAME}_grafana_oauth_client_secret_${SECRET_GRAFANA_OAUTH_CLIENT_SECRET_VERSION}
  prometheus_admin_password_hashed:
    external: true
    name: ${STACK_NAME}_prometheus_admin_password_hashed_${SECRET_PROMETHEUS_ADMIN_PASSWORD_HASHED_VERSION}
  prometheus_admin_password:
    external: true
    name: ${STACK_NAME}_prometheus_admin_password_${SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION}
  alertmanager_smtp_password:
    external: true
    name: ${STACK_NAME}_alertmanager_smtp_password_${SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION}
  loki_admin_password_hashed:
    external: true
    name: ${STACK_NAME}_loki_admin_password_hashed_${SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION}
  swarm_demo_admin_password:
    external: true
    name: ${STACK_NAME}_swarm_demo_admin_password_${SECRET_SWARM_DEMO_ADMIN_PASSWORD_VERSION}

40
env Normal file
View File

@ -0,0 +1,40 @@
# Deployment settings for the monitoring stack. The README loads this file
# with `set -a && source env && set +a` before `docker stack deploy`, so
# every assignment here is exported for interpolation into compose.yml.
#
# *_VERSION values form the suffix of the corresponding Docker config or
# secret name in compose.yml.

# --- stack-wide ---
TYPE=monitoring
STACK_NAME=gp_monitoring
LETS_ENCRYPT_ENV=production
# --- grafana ---
GRAFANA_DOMAIN=g.monitor.autonomic.zone
GRAFANA_CUSTOM_INI_VERSION=v3
GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
SECRET_GRAFANA_OAUTH_CLIENT_SECRET_VERSION=v1
# --- prometheus ---
PROMETHEUS_DOMAIN=p.monitor.autonomic.zone
PROMETHEUS_YML_VERSION=v10
PROMETHEUS_WEB_YML_VERSION=v2
SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1
SECRET_PROMETHEUS_ADMIN_PASSWORD_HASHED_VERSION=v1
# --- loki (object storage settings are read by loki.yml.tmpl) ---
LOKI_DOMAIN=l.monitor.autonomic.zone
LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
LOKI_AWS_REGION=eu-west-1
LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
LOKI_BUCKET_NAMES=loki
LOKI_YML_VERSION=v7
SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION=v1
# --- config versions for alertmanager and the nginx front for loki ---
ALERTMANAGER_CONFIG_VERSION=v2
NGINX_CONFIG_VERSION=v5
HTPASSWD_CONFIG_VERSION=v1
# --- OAuth endpoints, read by [auth.generic_oauth] in grafana_custom.ini ---
KEYCLOAK_AUTH_URL="https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/auth"
KEYCLOAK_API_URL="https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/userinfo"
KEYCLOAK_TOKEN_URL="https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/token"
# --- alert mail settings, read by alertmanager.yml.tmpl ---
ALERTMANAGER_SMTP_FROM=noreply@autonomic.zone
ALERTMANAGER_SMTP_HOST=mail.gandi.net:587
ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone
SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1
# --- password used by prometheus.yml.tmpl to scrape the swarm-demo host ---
SECRET_SWARM_DEMO_ADMIN_PASSWORD_VERSION=v1

View File

@ -1 +0,0 @@
hi

30
grafana_custom.ini Normal file
View File

@ -0,0 +1,30 @@
# Grafana configuration, mounted at /etc/grafana/grafana.ini by compose.yml.
# The file is registered as a Docker config with the golang template driver,
# so the env/secret actions further down are expanded at mount time.
[analytics]
reporting_enabled = false
[snapshots]
external_enabled = false
[users]
# NOTE(review): together with allow_sign_up = true below, this looks like it
# gives the Admin org role to every account created through OAuth sign-in;
# confirm that is intended.
auto_assign_org_role = Admin
[auth]
# The built-in login form is disabled; sign-in goes through the OAuth
# provider configured below.
disable_login_form = true
[auth.generic_oauth]
enabled = true
scopes = openid email profile
name = id.autonomic.zone
icon = signin
tls_skip_verify_insecure = false
allow_sign_up = true
client_id = grafana
# Client secret comes from the grafana_oauth_client_secret Docker secret;
# the three endpoint URLs come from the service environment.
client_secret = {{ secret "grafana_oauth_client_secret" }}
auth_url = {{ env "KEYCLOAK_AUTH_URL" }}
token_url = {{ env "KEYCLOAK_TOKEN_URL" }}
api_url = {{ env "KEYCLOAK_API_URL" }}
[auth.basic]
enabled = false
[plugins]
enable_alpha = true

1
loki.htpasswd.tmpl Normal file
View File

@ -0,0 +1 @@
loki:{{ secret "loki_admin_password_hashed" }}

View File

@ -1,5 +1,18 @@
auth_enabled: false auth_enabled: false
ruler:
storage:
type: local
local:
directory: /loki/rules
rule_path: /loki/scratch
alertmanager_url: http://alertmanager:9093
enable_api: true
enable_alertmanager_v2: true
ring:
kvstore:
store: inmemory
server: server:
http_listen_port: 3100 http_listen_port: 3100
@ -20,16 +33,9 @@ ingester:
memberlist: memberlist:
abort_if_cluster_join_fails: false abort_if_cluster_join_fails: false
# Expose this port on all distributor, ingester
# and querier replicas.
bind_port: 7946 bind_port: 7946
# You can use a headless k8s service for all distributor,
# ingester and querier components.
join_members: join_members:
- loki:7946 - {{ env "STACK_NAME" }}_loki:7946
max_join_backoff: 1m max_join_backoff: 1m
max_join_retries: 10 max_join_retries: 10
min_join_backoff: 1s min_join_backoff: 1s
@ -52,11 +58,11 @@ storage_config:
shared_store: aws shared_store: aws
aws: aws:
endpoint: {{ env LOKI_AWS_ENDPOINT }} endpoint: {{ env "LOKI_AWS_ENDPOINT" }}
region: {{ env LOKI_AWS_REGION }} region: {{ env "LOKI_AWS_REGION" }}
access_key_id: {{ env LOKI_ACCESS_KEY_ID }} access_key_id: {{ env "LOKI_ACCESS_KEY_ID" }}
secret_access_key: {{ secret loki_aws_secret_access_key }} secret_access_key: {{ secret "loki_aws_secret_access_key" }}
bucketnames: {{ env_LOKI_BUCKET_NAMES }} bucketnames: {{ env "LOKI_BUCKET_NAMES" }}
insecure: false insecure: false
sse_encryption: false sse_encryption: false
http_config: http_config:

View File

@ -1,131 +0,0 @@
---
version: "3.8"
services:
app:
image: grafana/grafana:8.4.4
volumes:
- grafana-data:/var/lib/grafana:rw
configs:
- source: grafana_datasources_yml
target: /etc/grafana/provisioning/datasources/datasources.yml
- source: grafana_dashboards_yml
target: /etc/grafana/provisioning/dashboards/dashboards.yml
- source: grafana_swarm_dashboard_json
target: /var/lib/grafana/dashboards/docker-swarm-nodes.json
- source: grafana_stacks_dashboard_json
target: /var/lib/grafana/dashboards/docker-swarm-stacks.json
- source: grafana_traefik_dashboard_json
target: /var/lib/grafana/dashboards/traefik.json
networks:
- proxy
- internal
environment:
- GF_SMTP_HOST
- GF_SMTP_ENABLED
- GF_SMTP_FROM_ADDRESS
- GF_SMTP_SKIP_VERIFY
- GF_SECURITY_ALLOW_EMBEDDING
- GF_INSTALL_PLUGINS=grafana-piechart-panel
- GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
deploy:
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
healthcheck:
test: "wget -q http://localhost:3000/ -O/dev/null"
interval: 5s
timeout: 10s
retries: 3
start_period: 10s
prometheus:
image: prom/prometheus:v2.34.0
volumes:
- prometheus-data:/prometheus:rw
configs:
- source: prometheus_yml
target: /etc/prometheus/prometheus.yml
networks:
- proxy
- internal
deploy:
restart_policy:
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090"
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
healthcheck:
test: "wget -q http://localhost:9090/graph -O/dev/null"
interval: 5s
timeout: 10s
retries: 3
start_period: 10s
loki:
image: grafana/loki:2.0.0
command: -config.file=/etc/loki/local-config.yaml
networks:
- internal
deploy:
endpoint_mode: dnsrr
ports:
- target: 3100
published: 3100
protocol: tcp
mode: host
configs:
- source: loki_yml
target: /etc/loki/local-config.yaml
volumes:
- loki-data:/loki
secrets:
- loki_aws_secret_access_key
configs:
prometheus_yml:
template_driver: golang
name: ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION}
file: prometheus.yml.tmpl
loki_yml:
template_driver: golang
name: ${STACK_NAME}_loki_yml_${LOKI_YML_VERSION}
file: loki.yml.tmpl
grafana_datasources_yml:
name: ${STACK_NAME}_grafana_datasources_yml_${GRAFANA_DATASOURCES_YML_VERSION}
file: grafana-datasources.yml
grafana_dashboards_yml:
name: ${STACK_NAME}_grafana_dashboards_yml_${GRAFANA_DASHBOARDS_YML_VERSION}
file: grafana-dashboards.yml
grafana_swarm_dashboard_json:
name: ${STACK_NAME}_grafana_swarm_dashboard_json_${GRAFANA_SWARM_DASHBOARD_JSON_VERSION}
file: grafana-swarm-dashboard.json
grafana_stacks_dashboard_json:
name: ${STACK_NAME}_grafana_stacks_dashboard_json_${GRAFANA_STACKS_DASHBOARD_JSON_VERSION}
file: grafana-stacks-dashboard.json
grafana_traefik_dashboard_json:
name: ${STACK_NAME}_grafana_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION}
file: grafana-traefik-dashboard.json
volumes:
prometheus-data:
grafana-data:
loki-data:
networks:
proxy:
external: true
internal:
secrets:
  loki_aws_secret_access_key:
    external: true
    # The version variable must be the one the env file actually defines
    # (SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION); the previous spelling
    # omitted "_SECRET" and therefore expanded to an empty suffix.
    name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}

View File

@ -1,43 +0,0 @@
TYPE=monitoring
STACK_NAME=gp_monitoring
GRAFANA_DOMAIN=g.monitor.autonomic.zone
PROMETHEUS_DOMAIN=p.monitor.autonomic.zone
LETS_ENCRYPT_ENV=production
# Edit this in order to allow collection of traefik metrics
#TRAEFIK_METRICS_ENABLED=1
#TRAEFIK_SERVICE_NAME=traefik_app
# grafana SMTP configuration (optional)
#GF_SMTP_HOST=changeme
#GF_SMTP_ENABLED=1
#GF_SMTP_FROM_ADDRESS=grafana@example.com
#GF_SMTP_SKIP_VERIFY=1
# Additional grafana settings (unlikely to require editing)
GF_SECURITY_ALLOW_EMBEDDING=1
GF_INSTALL_PLUGINS=grafana-piechart-panel
GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
# Loki stores logs in object storage, fill these up with your
# minio configuration (or any s3-compatible object store)
LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
LOKI_AWS_REGION=eu-west-1
LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
LOKI_BUCKET_NAMES=loki
# NOTE(d1): abra.sh env vars, while we deploy things manually
PROMETHEUS_YML_VERSION=v2
PROMTAIL_YML_VERSION=v1
LOKI_YML_VERSION=v1
NODE_EXPORTER_ENTRYPOINT_VERSION=v1
GRAFANA_DATASOURCES_YML_VERSION=v1
GRAFANA_DASHBOARDS_YML_VERSION=v1
GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1
GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1
GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1
SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1

View File

@ -1,14 +0,0 @@
---
apiVersion: 1
providers:
- name: 'default-dashboard-provider'
orgId: 1
folder: 'default-dashboards'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true

View File

@ -1,11 +0,0 @@
---
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
orgId: 1
url: http://prometheus:9090
isDefault: true
editable: false

View File

@ -1,745 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Simple docker swarm monitoring with relation to individual stacks.",
"editable": true,
"gnetId": 7007,
"graphTooltip": 1,
"id": 2,
"iteration": 1626744694222,
"links": [],
"panels": [
{
"cacheTimeout": null,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 0,
"y": 0
},
"id": 10,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.0.6",
"targets": [
{
"expr": "(time() - min(container_start_time_seconds{container_label_com_docker_stack_namespace=~\"$stack\"}))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "Stack Uptime",
"type": "stat"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": null,
"fill": 5,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 20,
"x": 4,
"y": 0
},
"height": "270px",
"hiddenSeries": false,
"id": 1,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.6",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval])) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "CPU Usage per Container",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 3
},
"id": 3,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "8.0.6",
"targets": [
{
"expr": "count(rate(container_last_seen{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"refId": "B",
"step": 4
}
],
"title": "Containers",
"type": "stat"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 3,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 7
},
"height": "270px",
"hiddenSeries": false,
"id": 5,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideZero": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.6",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_rss{container_label_com_docker_stack_namespace=~\"$stack\"}) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}",
"metric": "container_memory_rss",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Memory Usage per Container",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 14
},
"height": "270px",
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.6",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval])) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Incoming Network Traffic per Container",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 21
},
"height": "270px",
"hiddenSeries": false,
"id": 8,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.0.6",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{container_label_com_docker_stack_namespace=~\"$stack\"}[$interval])) by (container_label_com_docker_swarm_service_name, container_label_com_docker_swarm_task_id)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ container_label_com_docker_swarm_service_name }} - {{ container_label_com_docker_swarm_task_id }}",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Outgoing Network Traffic per Container",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 30,
"style": "dark",
"tags": [
"prometheus",
"cAdvisor",
"node-exporter",
"alertmanager"
],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": false,
"text": "pretix",
"value": "pretix"
},
"datasource": "Prometheus",
"definition": "",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "Stack",
"multi": false,
"name": "stack",
"options": [],
"query": {
"query": "query_result(count(container_last_seen{container_label_com_docker_stack_namespace =~\".+\"}) by (container_label_com_docker_stack_namespace))",
"refId": "Prometheus-stack-Variable-Query"
},
"refresh": 1,
"regex": "/container_label_com_docker_stack_namespace=\"(.*)\"/",
"skipUrlSync": false,
"sort": 2,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"auto": true,
"auto_count": 50,
"auto_min": "50s",
"current": {
"selected": false,
"text": "auto",
"value": "$__auto_interval_interval"
},
"description": null,
"error": null,
"hide": 0,
"label": "Interval",
"name": "interval",
"options": [
{
"selected": true,
"text": "auto",
"value": "$__auto_interval_interval"
},
{
"selected": false,
"text": "30s",
"value": "30s"
},
{
"selected": false,
"text": "1m",
"value": "1m"
},
{
"selected": false,
"text": "2m",
"value": "2m"
},
{
"selected": false,
"text": "3m",
"value": "3m"
},
{
"selected": false,
"text": "5m",
"value": "5m"
},
{
"selected": false,
"text": "7m",
"value": "7m"
},
{
"selected": false,
"text": "10m",
"value": "10m"
},
{
"selected": false,
"text": "30m",
"value": "30m"
},
{
"selected": false,
"text": "1h",
"value": "1h"
},
{
"selected": false,
"text": "6h",
"value": "6h"
},
{
"selected": false,
"text": "12h",
"value": "12h"
},
{
"selected": false,
"text": "1d",
"value": "1d"
},
{
"selected": false,
"text": "7d",
"value": "7d"
},
{
"selected": false,
"text": "14d",
"value": "14d"
},
{
"selected": false,
"text": "30d",
"value": "30d"
}
],
"query": "30s,1m,2m,3m,5m,7m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
"refresh": 2,
"skipUrlSync": false,
"type": "interval"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Swarm Stack Monitoring",
"uid": "KdVoGQm7z",
"version": 1
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,27 +0,0 @@
global:
scrape_interval: 30s # Scrape targets every 30 seconds. Default is every 1 minute.
evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets:
- localhost:9090

View File

@ -1,19 +0,0 @@
---
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://loki:3100/loki/api/v1/push
scrape_configs:
- job_name: system
static_configs:
- targets:
- localhost
labels:
job: varlogs
__path__: /var/log/*log

43
nginx.conf.tmpl Normal file
View File

@ -0,0 +1,43 @@
# nginx front for Loki: terminates plain HTTP on port 80 (TLS is handled by
# Traefik in front of this service), enforces basic auth and proxies to the
# Loki service of the same stack. Rendered through Docker's golang config
# template driver, which fills in the env actions below.
user www-data;

events {
    worker_connections 768;
}

http {
    include /etc/nginx/mime.types;

    # Derive the upstream Connection header from the client's Upgrade header:
    # "upgrade" when the client asks for a protocol upgrade (WebSocket),
    # "close" otherwise.
    map $http_upgrade $connection_upgrade {
        default upgrade;
        '' close;
    }

    server {
        listen 80;
        server_name {{ env "LOKI_DOMAIN" }};

        # Basic auth applies to every location unless switched off locally.
        auth_basic "loki";
        auth_basic_user_file /etc/nginx/conf.d/loki.htpasswd;

        location / {
            proxy_read_timeout 1800s;
            proxy_connect_timeout 1600s;
            proxy_pass http://{{ env "STACK_NAME" }}_loki:3100;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            # Connection is set exactly once, from the map above. A second
            # `proxy_set_header Connection "Keep-Alive"` used to follow and
            # made nginx send two contradictory Connection values upstream.
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header Proxy-Connection "Keep-Alive";
            proxy_redirect off;
        }

        # Readiness endpoint, reachable without credentials.
        location /ready {
            proxy_pass http://{{ env "STACK_NAME" }}_loki:3100;
            proxy_http_version 1.1;
            proxy_set_header Connection "Keep-Alive";
            proxy_set_header Proxy-Connection "Keep-Alive";
            proxy_redirect off;
            auth_basic "off";
        }
    }
}

36
prometheus.yml.tmpl Normal file
View File

@ -0,0 +1,36 @@
# Prometheus configuration template. compose.yml registers it as a Docker
# config with the golang template driver, which expands the secret actions
# below from the secrets attached to the prometheus service.
global:
scrape_interval: 30s
evaluation_interval: 30s
# Alerts are sent to the alertmanager service on port 9093.
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
# Prometheus scraping itself; credentials are supplied because compose.yml
# starts Prometheus with a web config file that defines basic-auth users.
- job_name: 'prometheus'
static_configs:
- targets:
- localhost:9090
basic_auth:
username: 'admin'
password: '{{ secret "prometheus_admin_password" }}'
# Remote host metrics from the swarm-demo machine, under /node_exporter.
# NOTE(review): no `scheme` is set for the two swarm_demo jobs, so they are
# scraped over plain http by default and the basic-auth password would be
# sent unencrypted on the first request; confirm whether `scheme: https`
# is needed here.
- job_name: 'swarm_demo_node_exporter'
metrics_path: '/node_exporter/metrics'
static_configs:
- targets:
- swarm-demo.gather.autonomic.zone
basic_auth:
username: 'admin'
password: '{{ secret "swarm_demo_admin_password" }}'
# Remote container metrics from the same host, under /cadvisor.
- job_name: 'swarm_demo_cadvisor'
metrics_path: '/cadvisor/metrics'
static_configs:
- targets:
- swarm-demo.gather.autonomic.zone
basic_auth:
username: 'admin'
password: '{{ secret "swarm_demo_admin_password" }}'

2
prometheus_web.yml.tmpl Normal file
View File

@ -0,0 +1,2 @@
# Prometheus web configuration (passed via --web.config.file in compose.yml).
# Maps the user name "admin" to the content of the
# prometheus_admin_password_hashed Docker secret; scripts/genpw.py prints a
# bcrypt hash for this purpose.
basic_auth_users:
admin: {{ secret "prometheus_admin_password_hashed" }}

12
scripts/genpw.py Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
"""Prompt for a password and print its bcrypt hash.

The password is read without echo and the resulting hash is written to
stdout. Background: https://prometheus.io/docs/guides/basic-auth/

You may need to `apt install python3-bcrypt` first.
"""
import getpass

import bcrypt

# Read the password interactively, then hash it with a freshly generated salt.
plaintext = getpass.getpass("password: ").encode("utf-8")
salt = bcrypt.gensalt()
print(bcrypt.hashpw(plaintext, salt).decode())