From 4985b80589abf92a26e77f9818b87c3988fc604a Mon Sep 17 00:00:00 2001
From: xinyangli
Date: Tue, 30 Jul 2024 15:56:02 +0800
Subject: [PATCH] feat: better prometheus integration

Summary of what this patch changes:

- kanidm: set `originLanding` for the HedgeDoc OAuth2 client.
- massicot: enable ntfy-sh on a unix socket (mode 0660, group "caddy") and
  reverse-proxy it from Caddy at https://ntfy.xinyang.life.
- hedgedoc/massicot: refer to the Caddy account through
  `config.services.caddy.user` instead of the literal "caddy".
- prometheus module:
  - only set gotosocial's metrics option when the module is enabled, and
    use the key `metrics-enabled` instead of `metrics-enable`;
  - enable a larger set of node exporter collectors;
  - enable a local Alertmanager with a single "ntfy" webhook receiver;
  - add alert rules for failed systemd units, load average and
    network transmit rate.

Review fixes folded into this revision (hunk sizes are unchanged):

- `alertmanagers[].path_prefix` is "/" rather than "/alertmanager": the
  Alertmanager enabled here has no external URL / route prefix configured,
  so its API is served from the root path.
- `HighLoadAverage` aggregates with `without (cpu, mode)` so the CPU count
  carries the same label set as `node_load1`; with `without (cpu)` the
  leftover `mode="idle"` label prevented the two sides from matching.
- The optional restic rule fragment no longer repeats the top-level
  `groups:` key; it now only appends a list item to the existing group
  list, since all entries of `rules` end up in one rule file.

---
 machines/massicot/kanidm-provision.nix |   1 +
 machines/massicot/services.nix         |  29 ++++-
 modules/nixos/hedgedoc.nix             |   2 +-
 modules/nixos/prometheus.nix           | 154 ++++++++++++++++++++-----
 4 files changed, 155 insertions(+), 31 deletions(-)

diff --git a/machines/massicot/kanidm-provision.nix b/machines/massicot/kanidm-provision.nix
index 0fdb7b1..9eb10dd 100644
--- a/machines/massicot/kanidm-provision.nix
+++ b/machines/massicot/kanidm-provision.nix
@@ -61,6 +61,7 @@
       hedgedoc = {
         displayName = "HedgeDoc";
         originUrl = "https://docs.xinyang.life/";
+        originLanding = "https://docs.xinyang.life/auth/oauth2";
         allowInsecureClientDisablePkce = true;
         scopeMaps = {
           hedgedoc-users = [ "openid" "email" "profile" ];
diff --git a/machines/massicot/services.nix b/machines/massicot/services.nix
index 6c87d4a..37126ab 100644
--- a/machines/massicot/services.nix
+++ b/machines/massicot/services.nix
@@ -62,6 +62,19 @@ in
       group = "kanidm";
     };
   };
+
+  services.ntfy-sh = {
+    enable = true;
+    group = "caddy";
+    settings = {
+      listen-unix = "/var/run/ntfy-sh/ntfy.sock";
+      listen-unix-mode = 432; # octal 0660
+      base-url = "https://ntfy.xinyang.life";
+    };
+  };
+
+  systemd.services.ntfy-sh.serviceConfig.RuntimeDirectory = "ntfy-sh";
+
   services.kanidm = {
     package = pkgs.kanidm.withSecretProvisioning;
     enableServer = true;
@@ -161,7 +174,12 @@ in
   };
 
   users.groups.git = { };
-
+  users.users = {
+    ${config.services.caddy.user}.extraGroups = [
+      config.services.ntfy-sh.group
+    ];
+  };
+
   services.caddy = {
     enable = true;
     virtualHosts."xinyang.life:443".extraConfig = ''
@@ -191,5 +209,14 @@ in
         }
       }
     '';
+    virtualHosts."https://ntfy.xinyang.life".extraConfig = ''
+      reverse_proxy unix/${config.services.ntfy-sh.settings.listen-unix}
+      @httpget {
+        protocol http
+        method GET
+        path_regexp ^/([-_a-z0-9]{0,64}$|docs/|static/)
+      }
+      redir @httpget https://{host}{uri}
+    '';
   };
 }
diff --git a/modules/nixos/hedgedoc.nix b/modules/nixos/hedgedoc.nix
index 6aa5de2..32e80c4 100644
--- a/modules/nixos/hedgedoc.nix
+++ b/modules/nixos/hedgedoc.nix
@@ -77,7 +77,7 @@ in
         reverse_proxy unix/${config.services.hedgedoc.settings.path}
       '';
     };
-    users.users.caddy.extraGroups = mkIf cfg.caddy [ "hedgedoc" ];
+    users.users.${config.services.caddy.user}.extraGroups = mkIf cfg.caddy [ "hedgedoc" ];
   };
 }
 
diff --git a/modules/nixos/prometheus.nix b/modules/nixos/prometheus.nix
index 9ddd255..40035f3 100644
--- a/modules/nixos/prometheus.nix
+++ b/modules/nixos/prometheus.nix
@@ -32,15 +32,16 @@ in
         }
       '';
       services.restic.server.prometheus = cfg.enable;
-      services.gotosocial.settings = {
-        metrics-enable = true;
+      services.gotosocial.settings = mkIf cfg.enable {
+        metrics-enabled = true;
       };
       services.prometheus = mkIf cfg.enable {
         enable = true;
         port = 9091;
         globalConfig.external_labels = { hostname = config.networking.hostName; };
         remoteWrite = mkIf cfg.grafana.enable [
-          { name = "grafana";
+          {
+            name = "grafana";
             url = "https://prometheus-prod-24-prod-eu-west-2.grafana.net/api/prom/push";
             basic_auth = {
               username = "1340065";
@@ -51,45 +52,140 @@ in
         exporters = {
           node = {
             enable = true;
-            enabledCollectors = [ "systemd" ];
+            enabledCollectors = [
+              "conntrack"
+              "diskstats"
+              "entropy"
+              "filefd"
+              "filesystem"
+              "loadavg"
+              "meminfo"
+              "netdev"
+              "netstat"
+              "stat"
+              "time"
+              "vmstat"
+              "systemd"
+              "logind"
+              "interrupts"
+              "ksmd"
+            ];
             port = 9100;
           };
         };
         scrapeConfigs = [
-          { job_name = "prometheus";
+          {
+            job_name = "prometheus";
             static_configs = [
               { targets = [ "localhost:${toString config.services.prometheus.port}" ]; }
             ];
           }
-          { job_name = "node";
+          {
+            job_name = "node";
             static_configs = [
               { targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; }
             ];
           }
         ];
+
+        alertmanager = {
+          enable = true;
+          listenAddress = "127.0.0.1";
+          extraFlags = [
+            "--cluster.advertise-address=127.0.0.1:9093"
+          ];
+          configuration = {
+            route = {
+              receiver = "ntfy";
+            };
+            receivers = [
+              {
+                name = "ntfy";
+                webhook_configs = [
+                  {
+                    url = "${config.services.ntfy-sh.settings.base-url}/prometheus-alerts";
+                    send_resolved = true;
+                  }
+                ];
+              }
+            ];
+          };
+        };
+
+        alertmanagers = [
+          {
+            scheme = "http";
+            path_prefix = "/";
+            static_configs = [
+              {
+                targets = [
+                  "${config.services.prometheus.alertmanager.listenAddress}:${toString config.services.prometheus.alertmanager.port}"
+                ];
+              }
+            ];
+          }
+        ];
+
+        rules = [
+          ''
+            groups:
+              - name: system_alerts
+                rules:
+                  - alert: SystemdFailedUnits
+                    expr: node_systemd_unit_state{state="failed"} > 0
+                    for: 5m
+                    labels:
+                      severity: critical
+                    annotations:
+                      summary: "Systemd has failed units on {{ $labels.instance }}"
+                      description: "There are {{ $value }} failed units on {{ $labels.instance }}. Immediate attention required!"
+
+                  - alert: HighLoadAverage
+                    expr: node_load1 > 0.8 * count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})
+                    for: 1m
+                    labels:
+                      severity: warning
+                    annotations:
+                      summary: "High load average detected on {{ $labels.instance }}"
+                      description: "The 1-minute load average ({{ $value }}) exceeds 80% the number of CPUs."
+
+                  - alert: HighTransmitTraffic
+                    expr: rate(node_network_transmit_bytes_total{device!="lo"}[5m]) > 100000000
+                    for: 1m
+                    labels:
+                      severity: warning
+                    annotations:
+                      summary: "High network transmit traffic on {{ $labels.instance }} ({{ $labels.device }})"
+                      description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is transmitting data at a rate exceeding 100 MB/s for the last 1 minute."
+          ''
+          (if config.services.restic.server.enable then
+            ''
+            # restic group: continues the groups list opened by the previous rules entry (no second "groups:" key)
+              - name: restic_alerts
+            '' else "")
+        ];
       };
     }
-  {
-    services.prometheus.scrapeConfigs = [
-      ( mkIf config.services.caddy.enable {
-        job_name = "caddy";
-        static_configs = [
-          { targets = [ "localhost:2019" ]; }
-        ];
-      })
-      ( mkIf config.services.restic.server.enable {
-        job_name = "restic";
-        static_configs = [
-          { targets = [ config.services.restic.server.listenAddress ]; }
-        ];
-      })
-      ( mkIf config.services.gotosocial.enable {
-        job_name = "gotosocial";
-        static_configs = [
-          { targets = [ "localhost:${toString config.services.gotosocial.settings.port}" ]; }
-        ];
-      })
-    ];
-  }
-  ]);
+    {
+      services.prometheus.scrapeConfigs = [
+        (mkIf config.services.caddy.enable {
+          job_name = "caddy";
+          static_configs = [
+            { targets = [ "localhost:2019" ]; }
+          ];
+        })
+        (mkIf config.services.restic.server.enable {
+          job_name = "restic";
+          static_configs = [
+            { targets = [ config.services.restic.server.listenAddress ]; }
+          ];
+        })
+        (mkIf config.services.gotosocial.enable {
+          job_name = "gotosocial";
+          static_configs = [
+            { targets = [ "localhost:${toString config.services.gotosocial.settings.port}" ]; }
+          ];
+        })
+      ];
+    }]);
 }