From 745ce62f88ba3b8add9cc3ce18a47c93ed22f4d0 Mon Sep 17 00:00:00 2001
From: xinyangli
Date: Tue, 30 Jul 2024 15:56:02 +0800
Subject: [PATCH] feat: better prometheus integration

Gate the GoToSocial metrics setting on the prometheus module being
enabled and rename the key to `metrics-enabled`. List the node exporter
collectors explicitly, run a loopback-only Alertmanager that forwards
alerts to an ntfy webhook, point Prometheus at it, and add alert rules
for failed systemd units, high load average and high transmit traffic.

Review fixes folded into this patch:

* HighLoadAverage aggregates the CPU count `without (cpu, mode)`.
  Dropping only `cpu` leaves a `mode` label on the right-hand side that
  `node_load1` does not carry, so the comparison never matched.
* The optional restic rule group is appended to the single `groups:`
  list instead of opening a second top-level `groups:` key, which is a
  duplicate YAML key once the strings are concatenated.

---
Reviewer note: the hunks were amended in review, so the post-image blob
id on the `index` line is stale. Whitespace was reconstructed; if the
context does not match, apply with `git apply --ignore-whitespace`.

 modules/nixos/prometheus.nix | 155 ++++++++++++++++++++++++++++-------
 1 file changed, 126 insertions(+), 29 deletions(-)

diff --git a/modules/nixos/prometheus.nix b/modules/nixos/prometheus.nix
index 9ddd255..3e59480 100644
--- a/modules/nixos/prometheus.nix
+++ b/modules/nixos/prometheus.nix
@@ -32,15 +32,16 @@ in
         }
       '';
       services.restic.server.prometheus = cfg.enable;
-      services.gotosocial.settings = {
-        metrics-enable = true;
+      services.gotosocial.settings = mkIf cfg.enable {
+        metrics-enabled = true;
       };
       services.prometheus = mkIf cfg.enable {
         enable = true;
         port = 9091;
         globalConfig.external_labels = { hostname = config.networking.hostName; };
         remoteWrite = mkIf cfg.grafana.enable [
-          { name = "grafana";
+          {
+            name = "grafana";
             url = "https://prometheus-prod-24-prod-eu-west-2.grafana.net/api/prom/push";
             basic_auth = {
               username = "1340065";
@@ -51,45 +52,141 @@ in
         exporters = {
           node = {
             enable = true;
-            enabledCollectors = [ "systemd" ];
+            enabledCollectors = [
+              "conntrack"
+              "diskstats"
+              "entropy"
+              "filefd"
+              "filesystem"
+              "loadavg"
+              "meminfo"
+              "netdev"
+              "netstat"
+              "stat"
+              "time"
+              "vmstat"
+              "systemd"
+              "logind"
+              "interrupts"
+              "ksmd"
+            ];
             port = 9100;
           };
         };
         scrapeConfigs = [
-          { job_name = "prometheus";
+          {
+            job_name = "prometheus";
             static_configs = [
               { targets = [ "localhost:${toString config.services.prometheus.port}" ]; }
             ];
           }
-          { job_name = "node";
+          {
+            job_name = "node";
             static_configs = [
               { targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; }
             ];
           }
         ];
+
+        alertmanager = {
+          enable = true;
+          listenAddress = "127.0.0.1";
+          extraFlags = [
+            "--cluster.advertise-address=127.0.0.1:9093"
+          ];
+          configuration = {
+            route = {
+              receiver = "ntfy";
+            };
+            receivers = [
+              {
+                name = "ntfy";
+                webhook_configs = [
+                  {
+                    url = "https://ntfy.xinyang.life/prometheus-alerts";
+                    send_resolved = true;
+                  }
+                ];
+              }
+            ];
+          };
+        };
+
+        alertmanagers = [
+          {
+            scheme = "http";
+            # NOTE(review): alertmanager above sets no webExternalUrl/route prefix;
+            # confirm it really serves under /alertmanager or alerts will 404.
+            path_prefix = "/alertmanager";
+            static_configs = [
+              {
+                targets = [
+                  "${config.services.prometheus.alertmanager.listenAddress}:${toString config.services.prometheus.alertmanager.port}"
+                ];
+              }
+            ];
+          }
+        ];
+
+        rules = [
+          ''
+            groups:
+              - name: system_alerts
+                rules:
+                  - alert: SystemdFailedUnits
+                    expr: node_systemd_unit_state{state="failed"} > 0
+                    for: 5m
+                    labels:
+                      severity: critical
+                    annotations:
+                      summary: "Systemd has failed units on {{ $labels.instance }}"
+                      description: "There are {{ $value }} failed units on {{ $labels.instance }}. Immediate attention required!"
+
+                  - alert: HighLoadAverage
+                    expr: node_load1 > 0.8 * count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})
+                    for: 1m
+                    labels:
+                      severity: warning
+                    annotations:
+                      summary: "High load average detected on {{ $labels.instance }}"
+                      description: "The 1-minute load average ({{ $value }}) exceeds 80% the number of CPUs."
+
+                  - alert: HighTransmitTraffic
+                    expr: rate(node_network_transmit_bytes_total{device!="lo"}[5m]) > 100000000
+                    for: 1m
+                    labels:
+                      severity: warning
+                    annotations:
+                      summary: "High network transmit traffic on {{ $labels.instance }} ({{ $labels.device }})"
+                      description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is transmitting data at a rate exceeding 100 MB/s for the last 1 minute."
+          '' + (if config.services.restic.server.enable then
+            # Appended to the `groups:` list above (items sit at two spaces);
+            # a second top-level `groups:` key would be a duplicate YAML key.
+            "  - name: restic_alerts\n    rules: []\n"
+          else "")
+        ];
       };
     }
-    {
-      services.prometheus.scrapeConfigs = [
-        ( mkIf config.services.caddy.enable {
-          job_name = "caddy";
-          static_configs = [
-            { targets = [ "localhost:2019" ]; }
-          ];
-        })
-        ( mkIf config.services.restic.server.enable {
-          job_name = "restic";
-          static_configs = [
-            { targets = [ config.services.restic.server.listenAddress ]; }
-          ];
-        })
-        ( mkIf config.services.gotosocial.enable {
-          job_name = "gotosocial";
-          static_configs = [
-            { targets = [ "localhost:${toString config.services.gotosocial.settings.port}" ]; }
-          ];
-        })
-      ];
-    }
-  ]);
+  {
+    services.prometheus.scrapeConfigs = [
+      (mkIf config.services.caddy.enable {
+        job_name = "caddy";
+        static_configs = [
+          { targets = [ "localhost:2019" ]; }
+        ];
+      })
+      (mkIf config.services.restic.server.enable {
+        job_name = "restic";
+        static_configs = [
+          { targets = [ config.services.restic.server.listenAddress ]; }
+        ];
+      })
+      (mkIf config.services.gotosocial.enable {
+        job_name = "gotosocial";
+        static_configs = [
+          { targets = [ "localhost:${toString config.services.gotosocial.settings.port}" ]; }
+        ];
+      })
+    ];
+  }]);
 }