|
defmodule Kafkaesque.Health do
  @moduledoc """
  Health check module for Kafkaesque.

  Provides system health information and readiness checks.

  Every individual check produces a `{status, name, info}` tuple where
  `status` is `:ok`, `:warning` or `:error`. Only `:error` results make the
  overall report unhealthy; `:warning` results are reported but do not fail
  the check.
  """

  alias Kafkaesque.Telemetry
  alias Kafkaesque.Topic.Supervisor, as: TopicSupervisor

  @typedoc "Outcome of a single check."
  @type check_status :: :ok | :warning | :error

  @typedoc "One entry of the `:checks` list in a health report."
  @type check_entry :: %{name: atom(), status: check_status(), info: map()}

  # Disk usage above this percentage is reported as :warning.
  @disk_warning_percent 90

  # Total VM memory above this many bytes (1 GiB) is reported as :warning.
  @memory_warning_bytes 1_073_741_824

  @doc """
  Performs a comprehensive health check of the system.

  Runs the registry, topics, telemetry, disk and memory checks, in that order.

  Returns `{:ok, report}` when no check reported `:error`, otherwise
  `{:error, report}`. In both cases `report` contains:

    * `:status` - `:healthy` or `:unhealthy`
    * `:timestamp` - system time in milliseconds
    * `:checks` - one `%{name: _, status: _, info: _}` map per check

  An unhealthy report additionally contains `:failed`, the names of the
  checks whose status was `:error`.
  """
  @spec check() :: {:ok, map()} | {:error, map()}
  def check do
    results = [
      check_registry(),
      check_topics(),
      check_telemetry(),
      check_disk_space(),
      check_memory()
    ]

    report = %{
      timestamp: System.system_time(:millisecond),
      checks:
        Enum.map(results, fn {status, name, info} ->
          %{name: name, status: status, info: info}
        end)
    }

    # The generator pattern keeps only the results tagged :error.
    case for {:error, name, _info} <- results, do: name do
      [] -> {:ok, Map.put(report, :status, :healthy)}
      failed -> {:error, Map.merge(report, %{status: :unhealthy, failed: failed})}
    end
  end

  @doc """
  Quick liveness check - returns `:ok` if the system is running.

  Note that, despite the trailing `?`, this returns the atom `:ok` rather
  than a boolean; the return value is kept as-is for existing callers.
  """
  @spec alive?() :: :ok
  def alive? do
    :ok
  end

  @doc """
  Readiness check - returns whether the system is ready to accept traffic.

  Returns `true` only when `check/0` returns `{:ok, _}`.
  """
  @spec ready?() :: boolean()
  def ready? do
    match?({:ok, _report}, check())
  end

  @doc """
  Returns basic system info for monitoring.

  `:uptime_ms` is the total wall-clock value reported by
  `:erlang.statistics(:wall_clock)`.
  """
  @spec info() :: map()
  def info do
    {wall_clock_ms, _since_last_call} = :erlang.statistics(:wall_clock)

    %{
      version: "0.1.0",
      node: node(),
      uptime_ms: wall_clock_ms,
      schedulers: System.schedulers_online(),
      otp_release: :erlang.system_info(:otp_release) |> List.to_string(),
      elixir_version: System.version()
    }
  end

  # Reports whether the topic registry process is registered and how many
  # entries it currently holds.
  defp check_registry do
    case Process.whereis(Kafkaesque.TopicRegistry) do
      nil ->
        {:error, :registry, %{message: "Registry not running"}}

      pid when is_pid(pid) ->
        # Registry.count/1 yields the number of entries without building a
        # list of all of them first.
        {:ok, :registry, %{pid: pid, entries: Registry.count(Kafkaesque.TopicRegistry)}}
    end
  end

  # Summarises the topics returned by TopicSupervisor.list_topics/0.
  # Each topic is expected to expose `name` and `partitions` fields.
  defp check_topics do
    topics = TopicSupervisor.list_topics()

    {:ok, :topics,
     %{
       count: length(topics),
       partitions: Enum.reduce(topics, 0, fn topic, acc -> acc + topic.partitions end),
       topics: Enum.map(topics, & &1.name)
     }}
  rescue
    _ -> {:error, :topics, %{message: "Failed to list topics"}}
  catch
    # An exit (e.g. from a call into a dead or unresponsive process) is not an
    # exception, so `rescue` alone would let it crash the health check.
    :exit, _ -> {:error, :topics, %{message: "Failed to list topics"}}
  end

  # Reports whether the telemetry process is registered and, if so, the
  # throughput figures it exposes (defaulting to 0 when absent).
  defp check_telemetry do
    case Process.whereis(Kafkaesque.Telemetry) do
      nil ->
        {:error, :telemetry, %{message: "Telemetry not running"}}

      pid when is_pid(pid) ->
        try do
          metrics = Telemetry.get_metrics()

          {:ok, :telemetry,
           %{
             pid: pid,
             messages_per_sec: metrics[:messages_per_sec] || 0,
             bytes_per_sec: metrics[:bytes_per_sec] || 0
           }}
        rescue
          _ -> {:error, :telemetry, %{message: "Failed to get metrics"}}
        catch
          # See check_topics/0: exits are not caught by `rescue`.
          :exit, _ -> {:error, :telemetry, %{message: "Failed to get metrics"}}
        end
    end
  end

  # Checks that the configured data directory exists and, where `df` is
  # available, reports free space and usage for its filesystem.
  defp check_disk_space do
    data_dir = Application.get_env(:kafkaesque_core, :data_dir, "./data")

    case File.stat(data_dir) do
      {:ok, _stat} ->
        disk_usage(data_dir)

      {:error, _reason} ->
        {:error, :disk, %{message: "Data directory does not exist", path: data_dir}}
    end
  end

  # Builds the :disk result for an existing data directory. Failing to obtain
  # or parse `df` output is not treated as an error: the directory exists, we
  # just cannot say how full it is.
  defp disk_usage(data_dir) do
    with {:ok, output} <- run_df(data_dir),
         {:ok, available_kb, used_percent} <- parse_df(output) do
      status = if used_percent > @disk_warning_percent, do: :warning, else: :ok

      {status, :disk,
       %{
         data_dir: data_dir,
         available_bytes: available_kb * 1024,
         used_percent: used_percent
       }}
    else
      :unavailable -> {:ok, :disk, %{data_dir: data_dir, exists: true}}
      {:unparsable, message} -> {:ok, :disk, %{data_dir: data_dir, message: message}}
    end
  end

  # Runs `df` for the given path. Returns `{:ok, output}` on exit status 0 and
  # `:unavailable` when the command fails or cannot be executed at all.
  defp run_df(data_dir) do
    # -P selects the POSIX output format, which keeps every filesystem on a
    # single line even when the device name is long.
    case System.cmd("df", ["-k", "-P", data_dir], stderr_to_stdout: true) do
      {output, 0} -> {:ok, output}
      {_output, _exit_status} -> :unavailable
    end
  rescue
    # System.cmd/3 raises when the executable is missing (e.g. no `df` on
    # this OS) instead of returning a non-zero exit status.
    ErlangError -> :unavailable
  end

  # Extracts `{:ok, available_kb, used_percent}` from `df -k -P` output, or
  # `{:unparsable, message}` when the output does not have the expected shape.
  defp parse_df(output) do
    case String.split(output, "\n") do
      [_header, row | _rest] -> parse_df_row(String.split(row))
      _ -> {:unparsable, "Unable to get disk info"}
    end
  end

  # Columns: filesystem, total, used, available, capacity, mount point.
  # Both the fourth and fifth columns are required.
  defp parse_df_row([_filesystem, _total, _used, available, capacity | _rest]) do
    with {available_kb, ""} <- Integer.parse(available),
         {used_percent, suffix} when suffix in ["", "%"] <- Integer.parse(capacity) do
      {:ok, available_kb, used_percent}
    else
      _ -> {:unparsable, "Unable to parse disk info"}
    end
  end

  defp parse_df_row(_columns), do: {:unparsable, "Unable to parse disk info"}

  # Reports VM memory usage; more than 1 GiB in total yields :warning.
  defp check_memory do
    memory = :erlang.memory()
    total = memory[:total]

    status = if total > @memory_warning_bytes, do: :warning, else: :ok

    {status, :memory,
     %{
       total_bytes: total,
       processes_bytes: memory[:processes],
       ets_bytes: memory[:ets],
       total_mb: div(total, 1024 * 1024)
     }}
  end
end
0 commit comments