From 7360ef53a5681ca5f7192094696c7b14ebaa5b85 Mon Sep 17 00:00:00 2001 From: bneradt Date: Tue, 12 May 2026 16:41:53 -0500 Subject: [PATCH] Add Prometheus v2 labeled stats Prometheus consumers currently only get flat ATS stat names, which makes related counters hard to aggregate and can hide broken output behind parser leniency. The draft v2 output also exposed that risk by letting `completed` look like a request method and by interleaving samples from the same family. This adds a Prometheus v2 response format that groups samples by metric family and derives labels for methods, directions, status codes, cache results, time buckets, and cache volumes. This keeps lifecycle counters such as completed requests as their own metrics while preserving the existing v1 output. This extends the stats_over_http AuTest and Prometheus ingester to validate both the raw v2 exposition and the parser's view of it. This catches split families, missing TYPE metadata, malformed labels, and regressions in the expected labeled samples. --- .../plugins/stats_over_http.en.rst | 24 +- plugins/stats_over_http/stats_over_http.cc | 422 ++++++++++++++++-- ...over_http_prometheus_v2_accept_stderr.gold | 11 + .../stats_over_http_prometheus_v2_stderr.gold | 11 + .../prometheus_stats_ingester.py | 339 +++++++++++++- .../stats_over_http/stats_over_http.test.py | 173 +++++++ 6 files changed, 935 insertions(+), 45 deletions(-) create mode 100644 tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold create mode 100644 tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold diff --git a/doc/admin-guide/plugins/stats_over_http.en.rst b/doc/admin-guide/plugins/stats_over_http.en.rst index 93d82422341..264109cdfbd 100644 --- a/doc/admin-guide/plugins/stats_over_http.en.rst +++ b/doc/admin-guide/plugins/stats_over_http.en.rst @@ -105,21 +105,29 @@ if you wish to have it in CSV format you can do so by passing an ``Accept`` head .. option:: Accept: text/csv -Prometheus formatted output is also supported via the ``Accept`` header: +Prometheus formatted output is also supported via the ``Accept`` header. Version 0.0.4 +(flat metric names) and version 2.0.0 (labeled metrics for better aggregation) +are supported: .. option:: Accept: text/plain; version=0.0.4 +.. option:: Accept: text/plain; version=2.0.0 Alternatively, the output format can be specified as a suffix to the configured path in the HTTP request target. The supported suffixes are ``/json``, -``/csv``, and ``/prometheus``. For example, if the path is set to ``/_stats`` -(the default), you can access the stats in CSV format by using the URL:: +``/csv``, ``/prometheus``, and ``/prometheus_v2``. For example, if the path +is set to ``/_stats`` (the default), you can access the stats in CSV format by +using the URL:: http://host:port/_stats/csv -The Prometheus format can be requested by using the URL:: +The Prometheus version 0.0.4 format (flat) can be requested by using the URL:: http://host:port/_stats/prometheus +The Prometheus v2 labeled format can be requested by using the URL:: + + http://host:port/_stats/prometheus_v2 + The JSON format is the default, but you can also access it explicitly by using the URL:: http://host:port/_stats/json @@ -129,9 +137,11 @@ specify a path suffix, the plugin will return the data in that format regardless the ``Accept`` header. In either case the ``Content-Type`` header returned by ``stats_over_http.so`` will -reflect the content that has been returned: ``text/json``, ``text/csv``, or -``text/plain; version=0.0.4; charset=utf-8`` for JSON, CSV, and Prometheus -formats respectively. +reflect the content that has been returned: ``text/json``, ``text/csv``, +``text/plain; version=0.0.4; charset=utf-8``, or +``text/plain; version=2.0.0; charset=utf-8`` for JSON, CSV, Prometheus v1, and +Prometheus v2 formats respectively. + Stats over http also accepts returning data in gzip or br compressed format per the ``Accept-encoding`` header. If the header is present, the plugin will return the diff --git a/plugins/stats_over_http/stats_over_http.cc b/plugins/stats_over_http/stats_over_http.cc index e927cc68499..29043c8f958 100644 --- a/plugins/stats_over_http/stats_over_http.cc +++ b/plugins/stats_over_http/stats_over_http.cc @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include @@ -92,6 +94,23 @@ const int BROTLI_LGW = 16; static bool integer_counters = false; static bool wrap_counters = false; +#if defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L && (!defined(__clang__) || __clang_major__ > 16) +#define STATS_OVER_HTTP_HAS_CONSTEXPR_STRING 1 +#else +#define STATS_OVER_HTTP_HAS_CONSTEXPR_STRING 0 +#endif + +struct prometheus_v2_metric { + std::string name; + std::string labels; +}; + +struct prometheus_v2_metric_family { + TSRecordDataType data_type = TS_RECORDDATATYPE_NULL; + std::string help; + std::vector samples; +}; + struct config_t { unsigned int recordTypes; std::string stats_path; @@ -103,7 +122,7 @@ struct config_holder_t { config_t *config; }; -enum class output_format_t { JSON_OUTPUT, CSV_OUTPUT, PROMETHEUS_OUTPUT }; +enum class output_format_t { JSON_OUTPUT, CSV_OUTPUT, PROMETHEUS_OUTPUT, PROMETHEUS_V2_OUTPUT }; enum class encoding_format_t { NONE, DEFLATE, GZIP, BR }; int configReloadRequests = 0; @@ -147,11 +166,13 @@ struct stats_state { TSIOBuffer resp_buffer = nullptr; TSIOBufferReader resp_reader = nullptr; - int output_bytes = 0; - int body_written = 0; - output_format_t output_format = output_format_t::JSON_OUTPUT; - encoding_format_t encoding = encoding_format_t::NONE; - z_stream zstrm; + int64_t output_bytes = 0; + int body_written = 0; + output_format_t output_format = output_format_t::JSON_OUTPUT; + encoding_format_t encoding = encoding_format_t::NONE; + z_stream zstrm; + std::unordered_map prometheus_v2_families; + std::vector prometheus_v2_family_order; #if HAVE_BROTLI_ENCODE_H b_stream bstrm; #endif @@ -168,6 +189,9 @@ struct stats_state { static char * nstr(const char *s) { + if (s == nullptr) { + return nullptr; + } char *mys = (char *)TSmalloc(strlen(s) + 1); strcpy(mys, s); return mys; @@ -246,7 +270,9 @@ stats_cleanup(TSCont contp, stats_state *my_state) my_state->resp_buffer = nullptr; } - TSVConnClose(my_state->net_vc); + if (my_state->net_vc != nullptr) { + TSVConnClose(my_state->net_vc); + } delete my_state; TSContDestroy(contp); } @@ -260,10 +286,13 @@ stats_process_accept(TSCont contp, stats_state *my_state) my_state->read_vio = TSVConnRead(my_state->net_vc, contp, my_state->req_buffer, INT64_MAX); } -static int +static int64_t stats_add_data_to_resp_buffer(const char *s, stats_state *my_state) { - int s_len = strlen(s); + if (s == nullptr) { + return 0; + } + int64_t s_len = strlen(s); TSIOBufferWrite(my_state->resp_buffer, s, s_len); @@ -293,6 +322,15 @@ static const char RESP_HEADER_PROMETHEUS_DEFLATE[] = "no-cache\r\n\r\n"; static const char RESP_HEADER_PROMETHEUS_BR[] = "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=0.0.4; " "charset=utf-8\r\nContent-Encoding: br\r\nCache-Control: no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2[] = + "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; charset=utf-8\r\nCache-Control: no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2_GZIP[] = "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; " + "charset=utf-8\r\nContent-Encoding: gzip\r\nCache-Control: no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2_DEFLATE[] = + "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; charset=utf-8\r\nContent-Encoding: deflate\r\nCache-Control: " + "no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2_BR[] = "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; " + "charset=utf-8\r\nContent-Encoding: br\r\nCache-Control: no-cache\r\n\r\n"; static int stats_add_resp_header(stats_state *my_state) @@ -331,6 +369,17 @@ stats_add_resp_header(stats_state *my_state) return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS, my_state); } break; + case output_format_t::PROMETHEUS_V2_OUTPUT: + if (my_state->encoding == encoding_format_t::GZIP) { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2_GZIP, my_state); + } else if (my_state->encoding == encoding_format_t::DEFLATE) { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2_DEFLATE, my_state); + } else if (my_state->encoding == encoding_format_t::BR) { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2_BR, my_state); + } else { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2, my_state); + } + break; } // Not reached. return stats_add_data_to_resp_buffer(RESP_HEADER_JSON, my_state); @@ -482,16 +531,12 @@ csv_out_stat(TSRecordType /* rec_type ATS_UNUSED */, void *edata, int /* registe * @param[in] name The metric name to sanitize. * @return A sanitized metric name. */ -static -// Remove this check when we drop support for pre-13 GCC versions. -#if defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L -// Clang <= 16 doesn't fully support constexpr std::string. -#if !defined(__clang__) || __clang_major__ > 16 - constexpr +#if STATS_OVER_HTTP_HAS_CONSTEXPR_STRING +static constexpr std::string +#else +static std::string #endif -#endif - std::string - sanitize_metric_name_for_prometheus(std::string_view name) +sanitize_metric_name_for_prometheus(std::string_view name) { std::string sanitized_name(name); // If the first character is a digit, prepend an underscore since Prometheus @@ -509,32 +554,302 @@ static return sanitized_name; } +/** Parse a Prometheus v2 metric name and return the base name and labels. + * + * @param[in] name The metric name to parse. + * @return A prometheus_v2_metric struct containing the base name and labels. + */ +static prometheus_v2_metric +parse_metric_v2(std::string_view name) +{ + swoc::TextView name_view{name}; + std::string labels; + std::string base_name; + + auto escape_label = [](std::string_view val) { + size_t escaped_len = 0; + for (char c : val) { + if (c == '"' || c == '\\' || c == '\n') { + escaped_len += 2; + } else { + escaped_len += 1; + } + } + + std::string escaped; + if (escaped_len > 0) { + escaped.reserve(escaped_len); + for (char c : val) { + if (c == '"' || c == '\\') { + escaped += '\\'; + escaped += c; + } else if (c == '\n') { + escaped += "\\n"; + } else { + escaped += c; + } + } + } + return escaped; + }; + + auto add_label = [&](std::string_view key, std::string_view val) { + if (!labels.empty()) { + labels += ", "; + } + labels += key; + labels += "=\""; + labels += escape_label(val); + labels += "\""; + }; + + constexpr std::string_view methods[] = {"get", "post", "head", "put", "delete", "options", "trace", "connect", "push", "purge"}; + constexpr std::string_view directions[] = {"incoming", "outgoing"}; + constexpr std::string_view results[] = {"hit", "miss", "error", "errors", "success", "failure"}; + constexpr std::string_view categories[] = {"volume", "thread", "interface", "net", "host", "port"}; + + auto contains = [](const std::string_view *arr, size_t size, std::string_view token) { + for (size_t i = 0; i < size; ++i) { + if (arr[i] == token) { + return true; + } + } + return false; + }; + + auto take_token = [](swoc::TextView &view) { + size_t sep = view.find_first_of("._[]"); + swoc::TextView token; + + if (sep == swoc::TextView::npos) { + token = view; + view.clear(); + } else { + token = view.prefix(sep); + view.remove_prefix(sep + 1); + } + return token; + }; + + while (!name_view.empty()) { + swoc::TextView token = take_token(name_view); + + if (token.empty()) { + continue; + } + + bool token_handled = false; + + // Status codes (200, 4xx, etc.) + if (token.length() == 3 && (token[0] >= '0' && token[0] <= '9') && ((token[1] >= '0' && token[1] <= '9') || token[1] == 'x') && + ((token[2] >= '0' && token[2] <= '9') || token[2] == 'x')) { + add_label("status", token); + token_handled = true; + } + // Direction (incoming / outgoing) + else if (contains(directions, sizeof(directions) / sizeof(directions[0]), token)) { + add_label("direction", token); + token_handled = true; + } + // Multi-token method categories. + else if (token == "extension" || token == "invalid") { + swoc::TextView next = name_view; + swoc::TextView next_token = take_token(next); + + if (token == "extension" && next_token == "method") { + add_label("method", "extension_method"); + name_view = next; + token_handled = true; + } else if (token == "invalid" && next_token == "client") { + add_label("method", "invalid_client"); + name_view = next; + token_handled = true; + } + } + // Methods + else if (contains(methods, sizeof(methods) / sizeof(methods[0]), token)) { + add_label("method", token); + token_handled = true; + } + // Generic Categories + Index (volume, 0, etc.) + else if (contains(categories, sizeof(categories) / sizeof(categories[0]), token)) { + swoc::TextView next = name_view; + swoc::TextView id = take_token(next); + + bool is_id = !id.empty(); + for (char c : id) { + if (!(c >= '0' && c <= '9') && c != 'x') { + is_id = false; + break; + } + } + if (is_id) { + add_label(token, id); + if (!base_name.empty()) { + base_name += "."; + } + base_name += token; + name_view = next; + token_handled = true; + } + } + // Results (hit, miss) + else if (contains(results, sizeof(results) / sizeof(results[0]), token)) { + // 'hit' and 'miss' are almost always labels. + if (token == "hit" || token == "miss" || !name_view.empty()) { + add_label("result", token); + token_handled = true; + } + } + // Buckets (e.g., 10ms) + else { + constexpr std::string_view units[] = {"ms", "us", "s"}; + for (const auto &unit : units) { + size_t unit_len = unit.length(); + if (token.length() > unit_len && token.substr(token.length() - unit_len) == unit) { + bool all_digits = true; + for (size_t j = 0; j < token.length() - unit_len; ++j) { + if (!(token[j] >= '0' && token[j] <= '9')) { + all_digits = false; + break; + } + } + if (all_digits && token.length() > unit_len) { + add_label("le", token); + token_handled = true; + break; + } + } + } + } + + if (!token_handled) { + if (!base_name.empty()) { + base_name += "."; + } + base_name += token; + } + } + + return {base_name, labels}; +} + +static bool +format_prometheus_v2_sample(std::string &sample, const std::string &name, const std::string &labels, TSRecordDataType data_type, + TSRecordData *datum) +{ + char val_buffer[128]; + int len = 0; + + if (data_type == TS_RECORDDATATYPE_COUNTER) { + len = snprintf(val_buffer, sizeof(val_buffer), "%" PRIu64 "\n", wrap_unsigned_counter(datum->rec_counter)); + } else if (data_type == TS_RECORDDATATYPE_INT) { + len = snprintf(val_buffer, sizeof(val_buffer), "%" PRIu64 "\n", wrap_unsigned_counter(datum->rec_int)); + } else if (data_type == TS_RECORDDATATYPE_FLOAT) { + len = snprintf(val_buffer, sizeof(val_buffer), "%g\n", datum->rec_float); + } + + if (len <= 0 || len >= static_cast(sizeof(val_buffer))) { + return false; + } + + sample.reserve(name.size() + labels.size() + static_cast(len) + 3); + sample += name; + if (!labels.empty()) { + sample += "{"; + sample += labels; + sample += "}"; + } + sample += " "; + sample += val_buffer; + + return true; +} + +static void +prometheus_v2_out_stat(TSRecordType /* rec_type ATS_UNUSED */, void *edata, int /* registered ATS_UNUSED */, const char *name, + TSRecordDataType data_type, TSRecordData *datum) +{ + stats_state *my_state = static_cast(edata); + + if (data_type == TS_RECORDDATATYPE_STRING) { + return; // Prometheus does not support string values. + } + + auto v2 = parse_metric_v2(name); + std::string sanitized_name = sanitize_metric_name_for_prometheus(v2.name); + + if (sanitized_name.empty()) { + return; + } + + std::string sample; + if (!format_prometheus_v2_sample(sample, sanitized_name, v2.labels, data_type, datum)) { + return; + } + + // Note: Prometheus requires all metrics with the same name to have the same type. + // If Traffic Server metrics with different types (e.g., COUNTER and INT) are collapsed + // into the same base name, the first one encountered will determine the reported TYPE. + auto [it, inserted] = my_state->prometheus_v2_families.try_emplace(sanitized_name); + if (inserted) { + it->second.data_type = data_type; + it->second.help = name; + my_state->prometheus_v2_family_order.emplace_back(sanitized_name); + } else { + // Validate type consistency (at least between counter and gauge). + bool prev_is_counter = (it->second.data_type == TS_RECORDDATATYPE_COUNTER); + bool curr_is_counter = (data_type == TS_RECORDDATATYPE_COUNTER); + if (prev_is_counter != curr_is_counter) { + Dbg(dbg_ctl, "Inconsistent types for base metric %s: previously %s, now %s. Labels: %s", sanitized_name.c_str(), + prev_is_counter ? "counter" : "gauge", curr_is_counter ? "counter" : "gauge", v2.labels.c_str()); + } + } + + it->second.samples.emplace_back(std::move(sample)); +} + static void prometheus_out_stat(TSRecordType /* rec_type ATS_UNUSED */, void *edata, int /* registered ATS_UNUSED */, const char *name, TSRecordDataType data_type, TSRecordData *datum) { stats_state *my_state = static_cast(edata); std::string sanitized_name = sanitize_metric_name_for_prometheus(name); - char type_buffer[256]; - char help_buffer[256]; - snprintf(help_buffer, sizeof(help_buffer), "# HELP %s %s\n", sanitized_name.c_str(), name); + if (sanitized_name.empty()) { + return; + } + switch (data_type) { case TS_RECORDDATATYPE_COUNTER: - APPEND(help_buffer); - snprintf(type_buffer, sizeof(type_buffer), "# TYPE %s counter\n", sanitized_name.c_str()); - APPEND(type_buffer); + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(name); + APPEND("\n"); + APPEND("# TYPE "); + APPEND(sanitized_name.c_str()); + APPEND(" counter\n"); APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%" PRIu64, wrap_unsigned_counter(datum->rec_counter)); break; case TS_RECORDDATATYPE_INT: - APPEND(help_buffer); - snprintf(type_buffer, sizeof(type_buffer), "# TYPE %s gauge\n", sanitized_name.c_str()); - APPEND(type_buffer); + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(name); + APPEND("\n"); + APPEND("# TYPE "); + APPEND(sanitized_name.c_str()); + APPEND(" gauge\n"); APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%" PRIu64, wrap_unsigned_counter(datum->rec_int)); break; case TS_RECORDDATATYPE_FLOAT: - APPEND(help_buffer); - APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%f", datum->rec_float); + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(name); + APPEND("\n"); + APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%g", datum->rec_float); break; case TS_RECORDDATATYPE_STRING: Dbg(dbg_ctl, "Prometheus does not support string values, skipping: %s", sanitized_name.c_str()); @@ -644,6 +959,37 @@ prometheus_out_stats(stats_state *my_state) // No version printed, since string stats are not supported by Prometheus. } +static void +prometheus_v2_out_stats(stats_state *my_state) +{ + TSRecordDump((TSRecordType)(TS_RECORDTYPE_PLUGIN | TS_RECORDTYPE_NODE | TS_RECORDTYPE_PROCESS), prometheus_v2_out_stat, my_state); + + for (const auto &sanitized_name : my_state->prometheus_v2_family_order) { + const auto &family = my_state->prometheus_v2_families.at(sanitized_name); + + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(family.help.c_str()); + APPEND("\n"); + + const char *type_str = (family.data_type == TS_RECORDDATATYPE_COUNTER) ? "counter" : "gauge"; + APPEND("# TYPE "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(type_str); + APPEND("\n"); + + for (const auto &sample : family.samples) { + APPEND(sample.c_str()); + } + } + + APPEND("# HELP current_time_epoch_ms Current time in milliseconds since epoch.\n"); + APPEND("# TYPE current_time_epoch_ms gauge\n"); + APPEND_STAT_PROMETHEUS_NUMERIC("current_time_epoch_ms", "%" PRIu64, ms_since_epoch()); +} + static void stats_process_write(TSCont contp, TSEvent event, stats_state *my_state) { @@ -660,6 +1006,9 @@ stats_process_write(TSCont contp, TSEvent event, stats_state *my_state) case output_format_t::PROMETHEUS_OUTPUT: prometheus_out_stats(my_state); break; + case output_format_t::PROMETHEUS_V2_OUTPUT: + prometheus_v2_out_stats(my_state); + break; } if ((my_state->encoding == encoding_format_t::GZIP) || (my_state->encoding == encoding_format_t::DEFLATE)) { @@ -753,6 +1102,8 @@ stats_origin(TSCont contp, TSEvent /* event ATS_UNUSED */, void *edata) format_per_path = output_format_t::CSV_OUTPUT; } else if (request_path_suffix == "/prometheus") { format_per_path = output_format_t::PROMETHEUS_OUTPUT; + } else if (request_path_suffix == "/prometheus_v2") { + format_per_path = output_format_t::PROMETHEUS_V2_OUTPUT; } else { Dbg(dbg_ctl, "Unknown suffix for stats path: %.*s", static_cast(request_path_suffix.length()), request_path_suffix.data()); @@ -796,6 +1147,9 @@ stats_origin(TSCont contp, TSEvent /* event ATS_UNUSED */, void *edata) } else if (!strncasecmp(str, "text/plain; version=0.0.4", len)) { Dbg(dbg_ctl, "Saw text/plain; version=0.0.4 in accept header, sending Prometheus output."); my_state->output_format = output_format_t::PROMETHEUS_OUTPUT; + } else if (!strncasecmp(str, "text/plain; version=2.0.0", len)) { + Dbg(dbg_ctl, "Saw text/plain; version=2.0.0 in accept header, sending Prometheus v2 output."); + my_state->output_format = output_format_t::PROMETHEUS_V2_OUTPUT; } else { Dbg(dbg_ctl, "Saw %.*s in accept header, defaulting to JSON output.", len, str); my_state->output_format = output_format_t::JSON_OUTPUT; @@ -1136,11 +1490,7 @@ config_handler(TSCont cont, TSEvent /* event ATS_UNUSED */, void * /* edata ATS_ // // Compilation time unit tests. // -#ifdef DEBUG -// Remove this check when we drop support for pre-13 GCC versions. -#if defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L -// Clang <= 16 doesn't fully support constexpr std::string. -#if !defined(__clang__) || __clang_major__ > 16 +#if defined(DEBUG) && STATS_OVER_HTTP_HAS_CONSTEXPR_STRING constexpr void test_sanitize_metric_name_for_prometheus() { @@ -1212,6 +1562,4 @@ test_sanitize_metric_name_for_prometheus() static_assert(sanitize_metric_name_for_prometheus("foo [[[bar]]]") == "foo____bar___"); static_assert(sanitize_metric_name_for_prometheus("foo@#$%bar") == "foo____bar"); } -#endif // !defined(__clang__) || __clang_major__ > 16 -#endif // defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L -#endif // DEBUG +#endif // defined(DEBUG) && STATS_OVER_HTTP_HAS_CONSTEXPR_STRING diff --git a/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold new file mode 100644 index 00000000000..bc0e58685e6 --- /dev/null +++ b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold @@ -0,0 +1,11 @@ +`` +> GET /_stats``HTTP/1.1 +`` +< HTTP/1.1 200 OK +< Content-Type: text/plain; version=2.0.0; charset=utf-8 +< Cache-Control: no-cache +< Date:`` +< Age:`` +< Transfer-Encoding: chunked +< Connection:`` +`` diff --git a/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold new file mode 100644 index 00000000000..9da19b0ad7f --- /dev/null +++ b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold @@ -0,0 +1,11 @@ +`` +> GET /_stats/prometheus_v2``HTTP/1.1 +`` +< HTTP/1.1 200 OK +< Content-Type: text/plain; version=2.0.0; charset=utf-8 +< Cache-Control: no-cache +< Date:`` +< Age:`` +< Transfer-Encoding: chunked +< Connection:`` +`` diff --git a/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py b/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py index 16b3701eccb..89b3ea20361 100644 --- a/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py +++ b/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py @@ -16,10 +16,19 @@ # limitations under the License. import argparse +from collections import Counter +import re import sys from urllib.request import urlopen from prometheus_client.parser import text_string_to_metric_families +HELP_RE = re.compile(r"^# HELP (?P[a-zA-Z_:][a-zA-Z0-9_:]*) (?P.*)$") +TYPE_RE = re.compile(r"^# TYPE (?P[a-zA-Z_:][a-zA-Z0-9_:]*) (?P[a-zA-Z]+)$") +SAMPLE_RE = re.compile( + r"^(?P[a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{(?P.*)\})?\s+" + r"(?P(?:[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)|(?:[+-]?(?:Inf|inf))|(?:NaN|nan))$") +LABEL_RE = re.compile(r'(?P[a-zA-Z_][a-zA-Z0-9_]*)="(?P(?:\\.|[^"\\])*)"') + def parse_args() -> argparse.Namespace: """ @@ -28,6 +37,16 @@ def parse_args() -> argparse.Namespace: :return: Parsed arguments with the 'url' attribute. """ parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--strict-family-metadata", + action="store_true", + help="Fail if parsed metric families are split or emitted without TYPE metadata.", + ) + parser.add_argument( + "--validate-v2-format", + action="store_true", + help="Fail if the raw v2 exposition is not grouped into complete, labeled metric families.", + ) parser.add_argument("url", help="URL to fetch metrics from") return parser.parse_args() @@ -54,7 +73,7 @@ def parse_ats_metrics(text: str) -> list: :return: List of parsed metric families. """ try: - families = text_string_to_metric_families(text) + families = list(text_string_to_metric_families(text)) except Exception as e: raise RuntimeError(f"Failed to parse metrics: {e}") @@ -63,6 +82,310 @@ def parse_ats_metrics(text: str) -> list: return families +def validate_metric_families(families: list) -> None: + """ + Verify each metric family is complete and emitted once. + + Prometheus' parser accepts samples without adjacent HELP/TYPE metadata by + parsing them as unknown families. That is useful leniency, but it can hide + broken exposition output where samples for one metric family are interleaved + with unrelated metrics. + + :param families: List of parsed metric families. + """ + family_counts = Counter(family.name for family in families) + duplicate_families = sorted(name for name, count in family_counts.items() if count > 1) + if duplicate_families: + raise RuntimeError(f"Duplicate metric families found: {', '.join(duplicate_families)}") + + unknown_families = sorted(family.name for family in families if family.type == "unknown") + if unknown_families: + raise RuntimeError(f"Metric families without TYPE metadata found: {', '.join(unknown_families)}") + + +def decode_label_value(value: str, line_no: int) -> str: + """ + Decode a Prometheus label value and reject unsupported escape sequences. + + :param value: Escaped label value without the surrounding quotes. + :param line_no: Line number used for diagnostics. + :return: Decoded label value. + """ + decoded = [] + i = 0 + while i < len(value): + if value[i] != "\\": + decoded.append(value[i]) + i += 1 + continue + + if i + 1 >= len(value): + raise RuntimeError(f"Line {line_no}: label value ends with an incomplete escape") + + escaped = value[i + 1] + if escaped == "n": + decoded.append("\n") + elif escaped in {'"', "\\"}: + decoded.append(escaped) + else: + raise RuntimeError(f"Line {line_no}: unsupported label escape \\{escaped}") + i += 2 + + return "".join(decoded) + + +def parse_labels(labels_text: str | None, line_no: int) -> dict[str, str]: + """ + Parse a Prometheus label block. + + :param labels_text: Label block without braces, or None if absent. + :param line_no: Line number used for diagnostics. + :return: Mapping of label names to decoded values. + """ + if labels_text is None: + return {} + if not labels_text: + raise RuntimeError(f"Line {line_no}: empty label block") + + labels = {} + pos = 0 + while pos < len(labels_text): + match = LABEL_RE.match(labels_text, pos) + if match is None: + raise RuntimeError(f"Line {line_no}: invalid label syntax near: {labels_text[pos:]}") + + name = match.group("name") + if name in labels: + raise RuntimeError(f"Line {line_no}: duplicate label {name}") + labels[name] = decode_label_value(match.group("value"), line_no) + + pos = match.end() + if pos == len(labels_text): + break + if labels_text[pos] != ",": + raise RuntimeError(f"Line {line_no}: expected comma after label {name}") + pos += 1 + if pos < len(labels_text) and labels_text[pos] == " ": + pos += 1 + if pos == len(labels_text): + raise RuntimeError(f"Line {line_no}: trailing comma in label block") + + return labels + + +def require_family(samples_by_family: dict[str, list[dict[str, str]]], family: str) -> list[dict[str, str]]: + """ + Retrieve samples for a required metric family. + + :param samples_by_family: Mapping of family names to their parsed labels. + :param family: Required family name. + :return: Samples for the family. + """ + try: + return samples_by_family[family] + except KeyError: + raise RuntimeError(f"Required metric family missing: {family}") + + +def require_label_values(samples: list[dict[str, str]], family: str, label: str, expected_values: set[str]) -> None: + """ + Verify a family has samples for each expected value of a label. + + :param samples: Parsed labels for all samples in the family. + :param family: Family name used for diagnostics. + :param label: Label name to inspect. + :param expected_values: Required label values. + """ + actual_values = {sample[label] for sample in samples if label in sample} + missing_values = sorted(expected_values - actual_values) + if missing_values: + raise RuntimeError(f"{family} is missing {label} values: {', '.join(missing_values)}") + + +def require_sample(samples: list[dict[str, str]], family: str, required_labels: dict[str, str]) -> None: + """ + Verify a family has a sample containing a set of labels. + + :param samples: Parsed labels for all samples in the family. + :param family: Family name used for diagnostics. + :param required_labels: Required labels and values. + """ + for sample in samples: + if all(sample.get(label) == value for label, value in required_labels.items()): + return + + labels = ", ".join(f'{label}="{value}"' for label, value in required_labels.items()) + raise RuntimeError(f"{family} is missing a sample with labels: {labels}") + + +def validate_prometheus_v2_label_coverage(samples_by_family: dict[str, list[dict[str, str]]]) -> None: + """ + Verify the v2 output exercises the expected label transformations. + + :param samples_by_family: Mapping of family names to their parsed labels. + """ + http_request_samples = require_family(samples_by_family, "proxy_process_http_requests") + require_label_values( + http_request_samples, + "proxy_process_http_requests", + "method", + { + "connect", + "delete", + "extension_method", + "get", + "head", + "invalid_client", + "options", + "post", + "purge", + "push", + "put", + "trace", + }, + ) + require_label_values(http_request_samples, "proxy_process_http_requests", "direction", {"incoming", "outgoing"}) + for sample in http_request_samples: + if set(sample) not in ({"method"}, {"direction"}): + raise RuntimeError(f"proxy_process_http_requests has unexpected labels: {sample}") + + completed_samples = require_family(samples_by_family, "proxy_process_http_completed_requests") + for sample in completed_samples: + if sample: + raise RuntimeError("proxy_process_http_completed_requests should not have labels") + + response_samples = require_family(samples_by_family, "proxy_process_http_responses") + require_label_values(response_samples, "proxy_process_http_responses", "direction", {"incoming"}) + require_label_values( + response_samples, + "proxy_process_http_responses", + "status", + {"000", "100", "1xx", "200", "2xx", "404", "4xx", "500", "5xx"}, + ) + + require_sample( + require_family(samples_by_family, "proxy_process_http_disallowed_continue"), + "proxy_process_http_disallowed_continue", + { + "method": "post", + "status": "100" + }, + ) + require_label_values( + require_family(samples_by_family, "proxy_process_http_cache_ims"), "proxy_process_http_cache_ims", "result", + {"hit", "miss"}) + require_label_values( + require_family(samples_by_family, "proxy_process_http_cache_fresh"), "proxy_process_http_cache_fresh", "result", {"hit"}) + require_sample( + require_family(samples_by_family, "proxy_process_http_transaction_counts_failed"), + "proxy_process_http_transaction_counts_failed", + { + "result": "errors", + "method": "connect" + }, + ) + require_label_values( + require_family(samples_by_family, "proxy_process_eventloop_count"), "proxy_process_eventloop_count", "le", + {"10s", "100s", "1000s"}) + require_label_values( + require_family(samples_by_family, "proxy_process_eventloop_time"), "proxy_process_eventloop_time", "le", + {"0ms", "100ms", "2560ms"}) + require_label_values( + require_family(samples_by_family, "proxy_process_cache_volume_lookup_active"), "proxy_process_cache_volume_lookup_active", + "volume", {"0"}) + require_label_values( + require_family(samples_by_family, "proxy_process_cache_volume_lookup_success"), "proxy_process_cache_volume_lookup_success", + "volume", {"0"}) + + for family, samples in samples_by_family.items(): + for sample in samples: + if sample.get("method") == "completed": + raise RuntimeError(f"{family} incorrectly labels completed as an HTTP method") + + +def validate_prometheus_v2_text(text: str) -> None: + """ + Verify the raw v2 exposition has complete grouped metric families. + + :param text: Raw ATS Prometheus v2 output. + """ + current_name = None + current_type = None + current_has_sample = False + seen_families = set() + samples_by_family = {} + help_count = 0 + type_count = 0 + sample_count = 0 + + for line_no, line in enumerate(text.splitlines(), 1): + if not line: + continue + + help_match = HELP_RE.match(line) + if help_match is not None: + if current_name is not None and not current_has_sample: + raise RuntimeError(f"Line {line_no}: family {current_name} has no samples") + + current_name = help_match.group("name") + if current_name in seen_families: + raise RuntimeError(f"Line {line_no}: duplicate HELP for metric family {current_name}") + seen_families.add(current_name) + samples_by_family[current_name] = [] + current_type = None + current_has_sample = False + help_count += 1 + continue + + type_match = TYPE_RE.match(line) + if type_match is not None: + type_name = type_match.group("name") + if current_name is None: + raise RuntimeError(f"Line {line_no}: TYPE appears before HELP") + if type_name != current_name: + raise RuntimeError(f"Line {line_no}: TYPE name {type_name} does not match HELP name {current_name}") + if current_type is not None: + raise RuntimeError(f"Line {line_no}: duplicate TYPE for metric family {current_name}") + + current_type = type_match.group("type") + if current_type not in ("counter", "gauge"): + raise RuntimeError(f"Line {line_no}: unsupported TYPE for {current_name}: {current_type}") + type_count += 1 + continue + + if line.startswith("#"): + raise RuntimeError(f"Line {line_no}: unsupported metadata line: {line}") + + sample_match = SAMPLE_RE.match(line) + if sample_match is None: + raise RuntimeError(f"Line {line_no}: invalid sample line: {line}") + if current_name is None or current_type is None: + raise RuntimeError(f"Line {line_no}: sample appears before HELP/TYPE") + + sample_name = sample_match.group("name") + expected_names = {current_name} + if current_type == "counter": + expected_names.add(f"{current_name}_total") + if sample_name not in expected_names: + raise RuntimeError(f"Line {line_no}: sample {sample_name} does not belong to family {current_name}") + + labels = parse_labels(sample_match.group("labels"), line_no) + samples_by_family[current_name].append(labels) + current_has_sample = True + sample_count += 1 + + if help_count == 0: + raise RuntimeError("No metric families found") + if current_name is not None and not current_has_sample: + raise RuntimeError(f"Metric family {current_name} has no samples") + if help_count != type_count: + raise RuntimeError(f"HELP/TYPE count mismatch: {help_count} HELP lines, {type_count} TYPE lines") + if sample_count < help_count: + raise RuntimeError(f"Expected at least one sample per family, saw {sample_count} samples for {help_count} families") + + validate_prometheus_v2_label_coverage(samples_by_family) + + def print_metrics(families: list) -> None: """ Print parsed metric families in Prometheus format. @@ -98,12 +421,26 @@ def main() -> int: print(f"Error fetching URL {args.url}: {e}", file=sys.stderr) return 1 + if args.validate_v2_format: + try: + validate_prometheus_v2_text(ats_output) + except RuntimeError as e: + print(f"Error validating Prometheus v2 metrics: {e}", file=sys.stderr) + return 1 + try: families = parse_ats_metrics(ats_output) except RuntimeError as e: print(f"Error parsing ATS metrics: {e}", file=sys.stderr) return 1 + if args.strict_family_metadata: + try: + validate_metric_families(families) + except RuntimeError as e: + print(f"Error validating metric families: {e}", file=sys.stderr) + return 1 + # Parsing issues may not arise until we try to print the metrics. try: print_metrics(families) diff --git a/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py b/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py index ab4b450373c..3b7ea77c15e 100644 --- a/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py +++ b/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py @@ -93,6 +93,133 @@ def __checkPrometheusMetrics(self, p: 'Test.Process', from_prometheus: bool): p.Streams.stdout += Testers.ContainsExpression( 'proxy_process_http_delete_requests 0', 'Verify the successful parsing of Prometheus metrics for a counter.') + def __checkPrometheusV2Metrics(self, p: "Test.Process"): + """Check the Prometheus v2 metrics output. + :param p: The process whose output to check. + """ + p.Streams.stdout += Testers.ContainsExpression( + "# HELP proxy_process_http_requests", + "Output should have a help line for the base metric name.", + ) + p.Streams.stdout += Testers.ContainsExpression( + "# TYPE proxy_process_http_requests counter", + "Output should have a type line for the base metric name.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{method="delete"}', + "Verify that HTTP method labels (GET, POST, DELETE, etc.) are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{method="extension_method"}', + "Verify that multi-token HTTP method labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{method="invalid_client"}', + "Verify that invalid client request labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{direction="incoming"}', + "Verify that direction labels (incoming / outgoing) are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + "proxy_process_http_completed_requests", + "Verify that completed_requests remains its own lifecycle counter.", + ) + p.Streams.stdout += Testers.ExcludesExpression( + 'method="completed"', + "completed is not an HTTP method label.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_cache_fresh{result="hit"}', + "Verify that result labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_disallowed_continue{method="post", status="100"}', + "Verify that status code labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_cache_volume_lookup_active{volume="0"}', + "Verify that volume labels are extracted from volume_N patterns.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_eventloop_count{le="', + "Verify that time buckets are correctly transformed into le labels.", + ) + + def __checkParsedPrometheusV2Metrics(self, p: "Test.Process"): + """Check the Prometheus parser's view of the v2 metrics output. + :param p: The process whose output to check. + """ + p.Streams.stdout += Testers.ContainsExpression( + "# TYPE proxy_process_http_requests counter", + "Prometheus parser should recognize HTTP request metrics as one counter family.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests_total{method="delete"}', + "Parsed output should retain HTTP method labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests_total{method="extension_method"}', + "Parsed output should retain multi-token HTTP method labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests_total{direction="incoming"}', + "Parsed output should retain direction labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + "proxy_process_http_completed_requests_total", + "Parsed output should keep completed_requests as its own counter family.", + ) + p.Streams.stdout += Testers.ExcludesExpression( + 'method="completed"', + "completed should not be parsed as an HTTP method label.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_disallowed_continue_total{method="post",status="100"}', + "Parsed output should preserve multiple labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + "# TYPE proxy_process_http_responses counter", + "Prometheus parser should recognize HTTP response metrics as one counter family.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_responses_total{direction="incoming"}', + "Parsed output should retain direction labels on response metrics.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_responses_total{status="2xx"}', + "Parsed output should retain status code class labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_cache_ims_total{result="miss"}', + "Parsed output should retain cache result labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_transaction_counts_failed_total{result="errors",method="connect"}', + "Parsed output should preserve combined result and method labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_eventloop_count{le="100s"}', + "Parsed output should retain bucket labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_cache_volume_lookup_active{volume="0"}', + "Parsed output should preserve gauge labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_cache_volume_lookup_success_total{volume="0"}', + "Parsed output should preserve counter labels for volume metrics.", + ) + def __testCaseNoAccept(self): tr = Test.AddTestRun('Fetch stats over HTTP in JSON format: no Accept and default path') self.__checkProcessBefore(tr) @@ -127,6 +254,19 @@ def __testCaseAcceptPrometheus(self): tr.Processes.Default.TimeOut = 3 self.__checkProcessAfter(tr) + def __testCaseAcceptPrometheusV2(self): + tr = Test.AddTestRun("Fetch stats over HTTP in Prometheus v2 format via Accept header") + self.__checkProcessBefore(tr) + tr.MakeCurlCommand( + f"-vs -H'Accept: text/plain; version=2.0.0' --http1.1 http://127.0.0.1:{self.ts.Variables.port}/_stats", + ts=self.ts, + ) + tr.Processes.Default.ReturnCode = 0 + self.__checkPrometheusV2Metrics(tr.Processes.Default) + tr.Processes.Default.Streams.stderr = ("gold/stats_over_http_prometheus_v2_accept_stderr.gold") + tr.Processes.Default.TimeOut = 3 + self.__checkProcessAfter(tr) + def __testCasePathJSON(self): tr = Test.AddTestRun('Fetch stats over HTTP in JSON format via /_stats/json') self.__checkProcessBefore(tr) @@ -160,6 +300,19 @@ def __testCasePathPrometheus(self): tr.Processes.Default.TimeOut = 3 self.__checkProcessAfter(tr) + def __testCasePathPrometheusV2(self): + tr = Test.AddTestRun("Fetch stats over HTTP in Prometheus v2 format via /_stats/prometheus_v2") + self.__checkProcessBefore(tr) + tr.MakeCurlCommand( + f"-vs --http1.1 http://127.0.0.1:{self.ts.Variables.port}/_stats/prometheus_v2", + ts=self.ts, + ) + tr.Processes.Default.ReturnCode = 0 + self.__checkPrometheusV2Metrics(tr.Processes.Default) + tr.Processes.Default.Streams.stderr = ("gold/stats_over_http_prometheus_v2_stderr.gold") + tr.Processes.Default.TimeOut = 3 + self.__checkProcessAfter(tr) + def __testCaseAcceptIgnoredIfPathExplicit(self): tr = Test.AddTestRun('Fetch stats over HTTP in Prometheus format with Accept csv header') self.__checkProcessBefore(tr) @@ -184,16 +337,36 @@ def __queryAndParsePrometheusMetrics(self): p.Command = f'{sys.executable} {ingester} http://127.0.0.1:{self.ts.Variables.port}/_stats/prometheus' p.ReturnCode = 0 self.__checkPrometheusMetrics(p, from_prometheus=True) + self.__checkProcessAfter(tr) + + def __queryAndParsePrometheusV2Metrics(self): + """ + Query the ATS stats over HTTP in Prometheus v2 format and parse the output. + """ + tr = Test.AddTestRun('Query and parse Prometheus v2 metrics') + ingester = 'prometheus_stats_ingester.py' + tr.Setup.CopyAs(ingester) + self.__checkProcessBefore(tr) + p = tr.Processes.Default + p.Command = ( + f'{sys.executable} {ingester} --validate-v2-format --strict-family-metadata ' + f'http://127.0.0.1:{self.ts.Variables.port}/_stats/prometheus_v2') + p.ReturnCode = 0 + self.__checkParsedPrometheusV2Metrics(p) + self.__checkProcessAfter(tr) def run(self): self.__testCaseNoAccept() self.__testCaseAcceptCSV() self.__testCaseAcceptPrometheus() + self.__testCaseAcceptPrometheusV2() self.__testCasePathJSON() self.__testCasePathCSV() self.__testCasePathPrometheus() + self.__testCasePathPrometheusV2() self.__testCaseAcceptIgnoredIfPathExplicit() self.__queryAndParsePrometheusMetrics() + self.__queryAndParsePrometheusV2Metrics() StatsOverHttpPluginTest().run()