diff --git a/doc/admin-guide/plugins/stats_over_http.en.rst b/doc/admin-guide/plugins/stats_over_http.en.rst index 93d82422341..264109cdfbd 100644 --- a/doc/admin-guide/plugins/stats_over_http.en.rst +++ b/doc/admin-guide/plugins/stats_over_http.en.rst @@ -105,21 +105,29 @@ if you wish to have it in CSV format you can do so by passing an ``Accept`` head .. option:: Accept: text/csv -Prometheus formatted output is also supported via the ``Accept`` header: +Prometheus formatted output is also supported via the ``Accept`` header. Version 0.0.4 +(flat metric names) and version 2.0.0 (labeled metrics for better aggregation) +are supported: .. option:: Accept: text/plain; version=0.0.4 +.. option:: Accept: text/plain; version=2.0.0 Alternatively, the output format can be specified as a suffix to the configured path in the HTTP request target. The supported suffixes are ``/json``, -``/csv``, and ``/prometheus``. For example, if the path is set to ``/_stats`` -(the default), you can access the stats in CSV format by using the URL:: +``/csv``, ``/prometheus``, and ``/prometheus_v2``. For example, if the path +is set to ``/_stats`` (the default), you can access the stats in CSV format by +using the URL:: http://host:port/_stats/csv -The Prometheus format can be requested by using the URL:: +The Prometheus version 0.0.4 format (flat) can be requested by using the URL:: http://host:port/_stats/prometheus +The Prometheus v2 labeled format can be requested by using the URL:: + + http://host:port/_stats/prometheus_v2 + The JSON format is the default, but you can also access it explicitly by using the URL:: http://host:port/_stats/json @@ -129,9 +137,11 @@ specify a path suffix, the plugin will return the data in that format regardless the ``Accept`` header. 
In either case the ``Content-Type`` header returned by ``stats_over_http.so`` will -reflect the content that has been returned: ``text/json``, ``text/csv``, or -``text/plain; version=0.0.4; charset=utf-8`` for JSON, CSV, and Prometheus -formats respectively. +reflect the content that has been returned: ``text/json``, ``text/csv``, +``text/plain; version=0.0.4; charset=utf-8``, or +``text/plain; version=2.0.0; charset=utf-8`` for JSON, CSV, Prometheus v1, and +Prometheus v2 formats respectively. + Stats over http also accepts returning data in gzip or br compressed format per the ``Accept-encoding`` header. If the header is present, the plugin will return the diff --git a/plugins/stats_over_http/stats_over_http.cc b/plugins/stats_over_http/stats_over_http.cc index e927cc68499..29043c8f958 100644 --- a/plugins/stats_over_http/stats_over_http.cc +++ b/plugins/stats_over_http/stats_over_http.cc @@ -39,6 +39,8 @@ #include #include #include +#include <unordered_map> +#include <vector> #include #include @@ -92,6 +94,23 @@ const int BROTLI_LGW = 16; static bool integer_counters = false; static bool wrap_counters = false; +#if defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L && (!defined(__clang__) || __clang_major__ > 16) +#define STATS_OVER_HTTP_HAS_CONSTEXPR_STRING 1 +#else +#define STATS_OVER_HTTP_HAS_CONSTEXPR_STRING 0 +#endif + +struct prometheus_v2_metric { + std::string name; + std::string labels; +}; + +struct prometheus_v2_metric_family { + TSRecordDataType data_type = TS_RECORDDATATYPE_NULL; + std::string help; + std::vector<std::string> samples; +}; + struct config_t { unsigned int recordTypes; std::string stats_path; @@ -103,7 +122,7 @@ struct config_holder_t { config_t *config; }; -enum class output_format_t { JSON_OUTPUT, CSV_OUTPUT, PROMETHEUS_OUTPUT }; +enum class output_format_t { JSON_OUTPUT, CSV_OUTPUT, PROMETHEUS_OUTPUT, PROMETHEUS_V2_OUTPUT }; enum class encoding_format_t { NONE, DEFLATE, GZIP, BR }; int configReloadRequests = 0; @@ -147,11 +166,13 @@ struct 
stats_state { TSIOBuffer resp_buffer = nullptr; TSIOBufferReader resp_reader = nullptr; - int output_bytes = 0; - int body_written = 0; - output_format_t output_format = output_format_t::JSON_OUTPUT; - encoding_format_t encoding = encoding_format_t::NONE; - z_stream zstrm; + int64_t output_bytes = 0; + int body_written = 0; + output_format_t output_format = output_format_t::JSON_OUTPUT; + encoding_format_t encoding = encoding_format_t::NONE; + z_stream zstrm; + std::unordered_map<std::string, prometheus_v2_metric_family> prometheus_v2_families; + std::vector<std::string> prometheus_v2_family_order; #if HAVE_BROTLI_ENCODE_H b_stream bstrm; #endif @@ -168,6 +189,9 @@ struct stats_state { static char * nstr(const char *s) { + if (s == nullptr) { + return nullptr; + } char *mys = (char *)TSmalloc(strlen(s) + 1); strcpy(mys, s); return mys; } @@ -246,7 +270,9 @@ stats_cleanup(TSCont contp, stats_state *my_state) my_state->resp_buffer = nullptr; } - TSVConnClose(my_state->net_vc); + if (my_state->net_vc != nullptr) { + TSVConnClose(my_state->net_vc); + } delete my_state; TSContDestroy(contp); } @@ -260,10 +286,13 @@ stats_process_accept(TSCont contp, stats_state *my_state) my_state->read_vio = TSVConnRead(my_state->net_vc, contp, my_state->req_buffer, INT64_MAX); } -static int +static int64_t stats_add_data_to_resp_buffer(const char *s, stats_state *my_state) { - int s_len = strlen(s); + if (s == nullptr) { + return 0; + } + int64_t s_len = strlen(s); TSIOBufferWrite(my_state->resp_buffer, s, s_len); @@ -293,6 +322,15 @@ static const char RESP_HEADER_PROMETHEUS_DEFLATE[] = "no-cache\r\n\r\n"; static const char RESP_HEADER_PROMETHEUS_BR[] = "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=0.0.4; " "charset=utf-8\r\nContent-Encoding: br\r\nCache-Control: no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2[] = + "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; charset=utf-8\r\nCache-Control: no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2_GZIP[] = "HTTP/1.0 200 OK\r\nContent-Type: 
text/plain; version=2.0.0; " + "charset=utf-8\r\nContent-Encoding: gzip\r\nCache-Control: no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2_DEFLATE[] = + "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; charset=utf-8\r\nContent-Encoding: deflate\r\nCache-Control: " + "no-cache\r\n\r\n"; +static const char RESP_HEADER_PROMETHEUS_V2_BR[] = "HTTP/1.0 200 OK\r\nContent-Type: text/plain; version=2.0.0; " + "charset=utf-8\r\nContent-Encoding: br\r\nCache-Control: no-cache\r\n\r\n"; static int stats_add_resp_header(stats_state *my_state) @@ -331,6 +369,17 @@ stats_add_resp_header(stats_state *my_state) return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS, my_state); } break; + case output_format_t::PROMETHEUS_V2_OUTPUT: + if (my_state->encoding == encoding_format_t::GZIP) { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2_GZIP, my_state); + } else if (my_state->encoding == encoding_format_t::DEFLATE) { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2_DEFLATE, my_state); + } else if (my_state->encoding == encoding_format_t::BR) { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2_BR, my_state); + } else { + return stats_add_data_to_resp_buffer(RESP_HEADER_PROMETHEUS_V2, my_state); + } + break; } // Not reached. return stats_add_data_to_resp_buffer(RESP_HEADER_JSON, my_state); @@ -482,16 +531,12 @@ csv_out_stat(TSRecordType /* rec_type ATS_UNUSED */, void *edata, int /* registe * @param[in] name The metric name to sanitize. * @return A sanitized metric name. */ -static -// Remove this check when we drop support for pre-13 GCC versions. -#if defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L -// Clang <= 16 doesn't fully support constexpr std::string. 
-#if !defined(__clang__) || __clang_major__ > 16 - constexpr +#if STATS_OVER_HTTP_HAS_CONSTEXPR_STRING +static constexpr std::string +#else +static std::string #endif -#endif - std::string - sanitize_metric_name_for_prometheus(std::string_view name) +sanitize_metric_name_for_prometheus(std::string_view name) { std::string sanitized_name(name); // If the first character is a digit, prepend an underscore since Prometheus @@ -509,32 +554,302 @@ static return sanitized_name; } +/** Parse a Prometheus v2 metric name and return the base name and labels. + * + * @param[in] name The metric name to parse. + * @return A prometheus_v2_metric struct containing the base name and labels. + */ +static prometheus_v2_metric +parse_metric_v2(std::string_view name) +{ + swoc::TextView name_view{name}; + std::string labels; + std::string base_name; + + auto escape_label = [](std::string_view val) { + size_t escaped_len = 0; + for (char c : val) { + if (c == '"' || c == '\\' || c == '\n') { + escaped_len += 2; + } else { + escaped_len += 1; + } + } + + std::string escaped; + if (escaped_len > 0) { + escaped.reserve(escaped_len); + for (char c : val) { + if (c == '"' || c == '\\') { + escaped += '\\'; + escaped += c; + } else if (c == '\n') { + escaped += "\\n"; + } else { + escaped += c; + } + } + } + return escaped; + }; + + auto add_label = [&](std::string_view key, std::string_view val) { + if (!labels.empty()) { + labels += ", "; + } + labels += key; + labels += "=\""; + labels += escape_label(val); + labels += "\""; + }; + + constexpr std::string_view methods[] = {"get", "post", "head", "put", "delete", "options", "trace", "connect", "push", "purge"}; + constexpr std::string_view directions[] = {"incoming", "outgoing"}; + constexpr std::string_view results[] = {"hit", "miss", "error", "errors", "success", "failure"}; + constexpr std::string_view categories[] = {"volume", "thread", "interface", "net", "host", "port"}; + + auto contains = [](const std::string_view *arr, size_t size, 
std::string_view token) { + for (size_t i = 0; i < size; ++i) { + if (arr[i] == token) { + return true; + } + } + return false; + }; + + auto take_token = [](swoc::TextView &view) { + size_t sep = view.find_first_of("._[]"); + swoc::TextView token; + + if (sep == swoc::TextView::npos) { + token = view; + view.clear(); + } else { + token = view.prefix(sep); + view.remove_prefix(sep + 1); + } + return token; + }; + + while (!name_view.empty()) { + swoc::TextView token = take_token(name_view); + + if (token.empty()) { + continue; + } + + bool token_handled = false; + + // Status codes (200, 4xx, etc.) + if (token.length() == 3 && (token[0] >= '0' && token[0] <= '9') && ((token[1] >= '0' && token[1] <= '9') || token[1] == 'x') && + ((token[2] >= '0' && token[2] <= '9') || token[2] == 'x')) { + add_label("status", token); + token_handled = true; + } + // Direction (incoming / outgoing) + else if (contains(directions, sizeof(directions) / sizeof(directions[0]), token)) { + add_label("direction", token); + token_handled = true; + } + // Multi-token method categories. + else if (token == "extension" || token == "invalid") { + swoc::TextView next = name_view; + swoc::TextView next_token = take_token(next); + + if (token == "extension" && next_token == "method") { + add_label("method", "extension_method"); + name_view = next; + token_handled = true; + } else if (token == "invalid" && next_token == "client") { + add_label("method", "invalid_client"); + name_view = next; + token_handled = true; + } + } + // Methods + else if (contains(methods, sizeof(methods) / sizeof(methods[0]), token)) { + add_label("method", token); + token_handled = true; + } + // Generic Categories + Index (volume, 0, etc.) 
+ else if (contains(categories, sizeof(categories) / sizeof(categories[0]), token)) { + swoc::TextView next = name_view; + swoc::TextView id = take_token(next); + + bool is_id = !id.empty(); + for (char c : id) { + if (!(c >= '0' && c <= '9') && c != 'x') { + is_id = false; + break; + } + } + if (is_id) { + add_label(token, id); + if (!base_name.empty()) { + base_name += "."; + } + base_name += token; + name_view = next; + token_handled = true; + } + } + // Results (hit, miss) + else if (contains(results, sizeof(results) / sizeof(results[0]), token)) { + // 'hit' and 'miss' are almost always labels. + if (token == "hit" || token == "miss" || !name_view.empty()) { + add_label("result", token); + token_handled = true; + } + } + // Buckets (e.g., 10ms) + else { + constexpr std::string_view units[] = {"ms", "us", "s"}; + for (const auto &unit : units) { + size_t unit_len = unit.length(); + if (token.length() > unit_len && token.substr(token.length() - unit_len) == unit) { + bool all_digits = true; + for (size_t j = 0; j < token.length() - unit_len; ++j) { + if (!(token[j] >= '0' && token[j] <= '9')) { + all_digits = false; + break; + } + } + if (all_digits && token.length() > unit_len) { + add_label("le", token); + token_handled = true; + break; + } + } + } + } + + if (!token_handled) { + if (!base_name.empty()) { + base_name += "."; + } + base_name += token; + } + } + + return {base_name, labels}; +} + +static bool +format_prometheus_v2_sample(std::string &sample, const std::string &name, const std::string &labels, TSRecordDataType data_type, + TSRecordData *datum) +{ + char val_buffer[128]; + int len = 0; + + if (data_type == TS_RECORDDATATYPE_COUNTER) { + len = snprintf(val_buffer, sizeof(val_buffer), "%" PRIu64 "\n", wrap_unsigned_counter(datum->rec_counter)); + } else if (data_type == TS_RECORDDATATYPE_INT) { + len = snprintf(val_buffer, sizeof(val_buffer), "%" PRIu64 "\n", wrap_unsigned_counter(datum->rec_int)); + } else if (data_type == TS_RECORDDATATYPE_FLOAT) 
{ + len = snprintf(val_buffer, sizeof(val_buffer), "%g\n", datum->rec_float); + } + + if (len <= 0 || len >= static_cast<int>(sizeof(val_buffer))) { + return false; + } + + sample.reserve(name.size() + labels.size() + static_cast<size_t>(len) + 3); + sample += name; + if (!labels.empty()) { + sample += "{"; + sample += labels; + sample += "}"; + } + sample += " "; + sample += val_buffer; + + return true; +} + +static void +prometheus_v2_out_stat(TSRecordType /* rec_type ATS_UNUSED */, void *edata, int /* registered ATS_UNUSED */, const char *name, + TSRecordDataType data_type, TSRecordData *datum) +{ + stats_state *my_state = static_cast<stats_state *>(edata); + + if (data_type == TS_RECORDDATATYPE_STRING) { + return; // Prometheus does not support string values. + } + + auto v2 = parse_metric_v2(name); + std::string sanitized_name = sanitize_metric_name_for_prometheus(v2.name); + + if (sanitized_name.empty()) { + return; + } + + std::string sample; + if (!format_prometheus_v2_sample(sample, sanitized_name, v2.labels, data_type, datum)) { + return; + } + + // Note: Prometheus requires all metrics with the same name to have the same type. + // If Traffic Server metrics with different types (e.g., COUNTER and INT) are collapsed + // into the same base name, the first one encountered will determine the reported TYPE. + auto [it, inserted] = my_state->prometheus_v2_families.try_emplace(sanitized_name); + if (inserted) { + it->second.data_type = data_type; + it->second.help = name; + my_state->prometheus_v2_family_order.emplace_back(sanitized_name); + } else { + // Validate type consistency (at least between counter and gauge). + bool prev_is_counter = (it->second.data_type == TS_RECORDDATATYPE_COUNTER); + bool curr_is_counter = (data_type == TS_RECORDDATATYPE_COUNTER); + if (prev_is_counter != curr_is_counter) { + Dbg(dbg_ctl, "Inconsistent types for base metric %s: previously %s, now %s. Labels: %s", sanitized_name.c_str(), + prev_is_counter ? "counter" : "gauge", curr_is_counter ? 
"counter" : "gauge", v2.labels.c_str()); + } + } + + it->second.samples.emplace_back(std::move(sample)); +} + static void prometheus_out_stat(TSRecordType /* rec_type ATS_UNUSED */, void *edata, int /* registered ATS_UNUSED */, const char *name, TSRecordDataType data_type, TSRecordData *datum) { stats_state *my_state = static_cast<stats_state *>(edata); std::string sanitized_name = sanitize_metric_name_for_prometheus(name); - char type_buffer[256]; - char help_buffer[256]; - snprintf(help_buffer, sizeof(help_buffer), "# HELP %s %s\n", sanitized_name.c_str(), name); + if (sanitized_name.empty()) { + return; + } + switch (data_type) { case TS_RECORDDATATYPE_COUNTER: - APPEND(help_buffer); - snprintf(type_buffer, sizeof(type_buffer), "# TYPE %s counter\n", sanitized_name.c_str()); - APPEND(type_buffer); + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(name); + APPEND("\n"); + APPEND("# TYPE "); + APPEND(sanitized_name.c_str()); + APPEND(" counter\n"); APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%" PRIu64, wrap_unsigned_counter(datum->rec_counter)); break; case TS_RECORDDATATYPE_INT: - APPEND(help_buffer); - snprintf(type_buffer, sizeof(type_buffer), "# TYPE %s gauge\n", sanitized_name.c_str()); - APPEND(type_buffer); + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(name); + APPEND("\n"); + APPEND("# TYPE "); + APPEND(sanitized_name.c_str()); + APPEND(" gauge\n"); APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%" PRIu64, wrap_unsigned_counter(datum->rec_int)); break; case TS_RECORDDATATYPE_FLOAT: - APPEND(help_buffer); - APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%f", datum->rec_float); + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(name); + APPEND("\n"); + APPEND_STAT_PROMETHEUS_NUMERIC(sanitized_name.c_str(), "%g", datum->rec_float); break; case TS_RECORDDATATYPE_STRING: Dbg(dbg_ctl, "Prometheus does not support string values, skipping: %s", 
sanitized_name.c_str()); @@ -644,6 +959,37 @@ prometheus_out_stats(stats_state *my_state) // No version printed, since string stats are not supported by Prometheus. } +static void +prometheus_v2_out_stats(stats_state *my_state) +{ + TSRecordDump((TSRecordType)(TS_RECORDTYPE_PLUGIN | TS_RECORDTYPE_NODE | TS_RECORDTYPE_PROCESS), prometheus_v2_out_stat, my_state); + + for (const auto &sanitized_name : my_state->prometheus_v2_family_order) { + const auto &family = my_state->prometheus_v2_families.at(sanitized_name); + + APPEND("# HELP "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(family.help.c_str()); + APPEND("\n"); + + const char *type_str = (family.data_type == TS_RECORDDATATYPE_COUNTER) ? "counter" : "gauge"; + APPEND("# TYPE "); + APPEND(sanitized_name.c_str()); + APPEND(" "); + APPEND(type_str); + APPEND("\n"); + + for (const auto &sample : family.samples) { + APPEND(sample.c_str()); + } + } + + APPEND("# HELP current_time_epoch_ms Current time in milliseconds since epoch.\n"); + APPEND("# TYPE current_time_epoch_ms gauge\n"); + APPEND_STAT_PROMETHEUS_NUMERIC("current_time_epoch_ms", "%" PRIu64, ms_since_epoch()); +} + static void stats_process_write(TSCont contp, TSEvent event, stats_state *my_state) { @@ -660,6 +1006,9 @@ stats_process_write(TSCont contp, TSEvent event, stats_state *my_state) case output_format_t::PROMETHEUS_OUTPUT: prometheus_out_stats(my_state); break; + case output_format_t::PROMETHEUS_V2_OUTPUT: + prometheus_v2_out_stats(my_state); + break; } if ((my_state->encoding == encoding_format_t::GZIP) || (my_state->encoding == encoding_format_t::DEFLATE)) { @@ -753,6 +1102,8 @@ stats_origin(TSCont contp, TSEvent /* event ATS_UNUSED */, void *edata) format_per_path = output_format_t::CSV_OUTPUT; } else if (request_path_suffix == "/prometheus") { format_per_path = output_format_t::PROMETHEUS_OUTPUT; + } else if (request_path_suffix == "/prometheus_v2") { + format_per_path = output_format_t::PROMETHEUS_V2_OUTPUT; } else { Dbg(dbg_ctl, 
"Unknown suffix for stats path: %.*s", static_cast<int>(request_path_suffix.length()), request_path_suffix.data()); @@ -796,6 +1147,9 @@ stats_origin(TSCont contp, TSEvent /* event ATS_UNUSED */, void *edata) } else if (!strncasecmp(str, "text/plain; version=0.0.4", len)) { Dbg(dbg_ctl, "Saw text/plain; version=0.0.4 in accept header, sending Prometheus output."); my_state->output_format = output_format_t::PROMETHEUS_OUTPUT; + } else if (!strncasecmp(str, "text/plain; version=2.0.0", len)) { + Dbg(dbg_ctl, "Saw text/plain; version=2.0.0 in accept header, sending Prometheus v2 output."); + my_state->output_format = output_format_t::PROMETHEUS_V2_OUTPUT; } else { Dbg(dbg_ctl, "Saw %.*s in accept header, defaulting to JSON output.", len, str); my_state->output_format = output_format_t::JSON_OUTPUT; @@ -1136,11 +1490,7 @@ config_handler(TSCont cont, TSEvent /* event ATS_UNUSED */, void * /* edata ATS_ // // Compilation time unit tests. // -#ifdef DEBUG -// Remove this check when we drop support for pre-13 GCC versions. -#if defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L -// Clang <= 16 doesn't fully support constexpr std::string. 
-#if !defined(__clang__) || __clang_major__ > 16 +#if defined(DEBUG) && STATS_OVER_HTTP_HAS_CONSTEXPR_STRING constexpr void test_sanitize_metric_name_for_prometheus() { @@ -1212,6 +1562,4 @@ test_sanitize_metric_name_for_prometheus() static_assert(sanitize_metric_name_for_prometheus("foo [[[bar]]]") == "foo____bar___"); static_assert(sanitize_metric_name_for_prometheus("foo@#$%bar") == "foo____bar"); } -#endif // !defined(__clang__) || __clang_major__ > 16 -#endif // defined(__cpp_lib_constexpr_string) && __cpp_lib_constexpr_string >= 201907L -#endif // DEBUG +#endif // defined(DEBUG) && STATS_OVER_HTTP_HAS_CONSTEXPR_STRING diff --git a/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold new file mode 100644 index 00000000000..bc0e58685e6 --- /dev/null +++ b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_accept_stderr.gold @@ -0,0 +1,11 @@ +`` +> GET /_stats``HTTP/1.1 +`` +< HTTP/1.1 200 OK +< Content-Type: text/plain; version=2.0.0; charset=utf-8 +< Cache-Control: no-cache +< Date:`` +< Age:`` +< Transfer-Encoding: chunked +< Connection:`` +`` diff --git a/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold new file mode 100644 index 00000000000..9da19b0ad7f --- /dev/null +++ b/tests/gold_tests/pluginTest/stats_over_http/gold/stats_over_http_prometheus_v2_stderr.gold @@ -0,0 +1,11 @@ +`` +> GET /_stats/prometheus_v2``HTTP/1.1 +`` +< HTTP/1.1 200 OK +< Content-Type: text/plain; version=2.0.0; charset=utf-8 +< Cache-Control: no-cache +< Date:`` +< Age:`` +< Transfer-Encoding: chunked +< Connection:`` +`` diff --git a/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py b/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py 
index 16b3701eccb..89b3ea20361 100644 --- a/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py +++ b/tests/gold_tests/pluginTest/stats_over_http/prometheus_stats_ingester.py @@ -16,10 +16,19 @@ # limitations under the License. import argparse +from collections import Counter +import re import sys from urllib.request import urlopen from prometheus_client.parser import text_string_to_metric_families +HELP_RE = re.compile(r"^# HELP (?P<name>[a-zA-Z_:][a-zA-Z0-9_:]*) (?P<help>.*)$") +TYPE_RE = re.compile(r"^# TYPE (?P<name>[a-zA-Z_:][a-zA-Z0-9_:]*) (?P<type>[a-zA-Z]+)$") +SAMPLE_RE = re.compile( + r"^(?P<name>[a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{(?P<labels>.*)\})?\s+" + r"(?P<value>(?:[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)|(?:[+-]?(?:Inf|inf))|(?:NaN|nan))$") +LABEL_RE = re.compile(r'(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)="(?P<value>(?:\\.|[^"\\])*)"') + def parse_args() -> argparse.Namespace: """ @@ -28,6 +37,16 @@ def parse_args() -> argparse.Namespace: :return: Parsed arguments with the 'url' attribute. """ parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--strict-family-metadata", + action="store_true", + help="Fail if parsed metric families are split or emitted without TYPE metadata.", + ) + parser.add_argument( + "--validate-v2-format", + action="store_true", + help="Fail if the raw v2 exposition is not grouped into complete, labeled metric families.", + ) + parser.add_argument("url", help="URL to fetch metrics from") return parser.parse_args() @@ -54,7 +73,7 @@ def parse_ats_metrics(text: str) -> list: :return: List of parsed metric families. """ try: - families = text_string_to_metric_families(text) + families = list(text_string_to_metric_families(text)) except Exception as e: raise RuntimeError(f"Failed to parse metrics: {e}") @@ -63,6 +82,310 @@ def parse_ats_metrics(text: str) -> list: return families +def validate_metric_families(families: list) -> None: + """ + Verify each metric family is complete and emitted once. 
+ + Prometheus' parser accepts samples without adjacent HELP/TYPE metadata by + parsing them as unknown families. That is useful leniency, but it can hide + broken exposition output where samples for one metric family are interleaved + with unrelated metrics. + + :param families: List of parsed metric families. + """ + family_counts = Counter(family.name for family in families) + duplicate_families = sorted(name for name, count in family_counts.items() if count > 1) + if duplicate_families: + raise RuntimeError(f"Duplicate metric families found: {', '.join(duplicate_families)}") + + unknown_families = sorted(family.name for family in families if family.type == "unknown") + if unknown_families: + raise RuntimeError(f"Metric families without TYPE metadata found: {', '.join(unknown_families)}") + + +def decode_label_value(value: str, line_no: int) -> str: + """ + Decode a Prometheus label value and reject unsupported escape sequences. + + :param value: Escaped label value without the surrounding quotes. + :param line_no: Line number used for diagnostics. + :return: Decoded label value. + """ + decoded = [] + i = 0 + while i < len(value): + if value[i] != "\\": + decoded.append(value[i]) + i += 1 + continue + + if i + 1 >= len(value): + raise RuntimeError(f"Line {line_no}: label value ends with an incomplete escape") + + escaped = value[i + 1] + if escaped == "n": + decoded.append("\n") + elif escaped in {'"', "\\"}: + decoded.append(escaped) + else: + raise RuntimeError(f"Line {line_no}: unsupported label escape \\{escaped}") + i += 2 + + return "".join(decoded) + + +def parse_labels(labels_text: str | None, line_no: int) -> dict[str, str]: + """ + Parse a Prometheus label block. + + :param labels_text: Label block without braces, or None if absent. + :param line_no: Line number used for diagnostics. + :return: Mapping of label names to decoded values. 
+ """ + if labels_text is None: + return {} + if not labels_text: + raise RuntimeError(f"Line {line_no}: empty label block") + + labels = {} + pos = 0 + while pos < len(labels_text): + match = LABEL_RE.match(labels_text, pos) + if match is None: + raise RuntimeError(f"Line {line_no}: invalid label syntax near: {labels_text[pos:]}") + + name = match.group("name") + if name in labels: + raise RuntimeError(f"Line {line_no}: duplicate label {name}") + labels[name] = decode_label_value(match.group("value"), line_no) + + pos = match.end() + if pos == len(labels_text): + break + if labels_text[pos] != ",": + raise RuntimeError(f"Line {line_no}: expected comma after label {name}") + pos += 1 + if pos < len(labels_text) and labels_text[pos] == " ": + pos += 1 + if pos == len(labels_text): + raise RuntimeError(f"Line {line_no}: trailing comma in label block") + + return labels + + +def require_family(samples_by_family: dict[str, list[dict[str, str]]], family: str) -> list[dict[str, str]]: + """ + Retrieve samples for a required metric family. + + :param samples_by_family: Mapping of family names to their parsed labels. + :param family: Required family name. + :return: Samples for the family. + """ + try: + return samples_by_family[family] + except KeyError: + raise RuntimeError(f"Required metric family missing: {family}") + + +def require_label_values(samples: list[dict[str, str]], family: str, label: str, expected_values: set[str]) -> None: + """ + Verify a family has samples for each expected value of a label. + + :param samples: Parsed labels for all samples in the family. + :param family: Family name used for diagnostics. + :param label: Label name to inspect. + :param expected_values: Required label values. 
+ """ + actual_values = {sample[label] for sample in samples if label in sample} + missing_values = sorted(expected_values - actual_values) + if missing_values: + raise RuntimeError(f"{family} is missing {label} values: {', '.join(missing_values)}") + + +def require_sample(samples: list[dict[str, str]], family: str, required_labels: dict[str, str]) -> None: + """ + Verify a family has a sample containing a set of labels. + + :param samples: Parsed labels for all samples in the family. + :param family: Family name used for diagnostics. + :param required_labels: Required labels and values. + """ + for sample in samples: + if all(sample.get(label) == value for label, value in required_labels.items()): + return + + labels = ", ".join(f'{label}="{value}"' for label, value in required_labels.items()) + raise RuntimeError(f"{family} is missing a sample with labels: {labels}") + + +def validate_prometheus_v2_label_coverage(samples_by_family: dict[str, list[dict[str, str]]]) -> None: + """ + Verify the v2 output exercises the expected label transformations. + + :param samples_by_family: Mapping of family names to their parsed labels. 
+ """ + http_request_samples = require_family(samples_by_family, "proxy_process_http_requests") + require_label_values( + http_request_samples, + "proxy_process_http_requests", + "method", + { + "connect", + "delete", + "extension_method", + "get", + "head", + "invalid_client", + "options", + "post", + "purge", + "push", + "put", + "trace", + }, + ) + require_label_values(http_request_samples, "proxy_process_http_requests", "direction", {"incoming", "outgoing"}) + for sample in http_request_samples: + if set(sample) not in ({"method"}, {"direction"}): + raise RuntimeError(f"proxy_process_http_requests has unexpected labels: {sample}") + + completed_samples = require_family(samples_by_family, "proxy_process_http_completed_requests") + for sample in completed_samples: + if sample: + raise RuntimeError("proxy_process_http_completed_requests should not have labels") + + response_samples = require_family(samples_by_family, "proxy_process_http_responses") + require_label_values(response_samples, "proxy_process_http_responses", "direction", {"incoming"}) + require_label_values( + response_samples, + "proxy_process_http_responses", + "status", + {"000", "100", "1xx", "200", "2xx", "404", "4xx", "500", "5xx"}, + ) + + require_sample( + require_family(samples_by_family, "proxy_process_http_disallowed_continue"), + "proxy_process_http_disallowed_continue", + { + "method": "post", + "status": "100" + }, + ) + require_label_values( + require_family(samples_by_family, "proxy_process_http_cache_ims"), "proxy_process_http_cache_ims", "result", + {"hit", "miss"}) + require_label_values( + require_family(samples_by_family, "proxy_process_http_cache_fresh"), "proxy_process_http_cache_fresh", "result", {"hit"}) + require_sample( + require_family(samples_by_family, "proxy_process_http_transaction_counts_failed"), + "proxy_process_http_transaction_counts_failed", + { + "result": "errors", + "method": "connect" + }, + ) + require_label_values( + require_family(samples_by_family, 
"proxy_process_eventloop_count"), "proxy_process_eventloop_count", "le", + {"10s", "100s", "1000s"}) + require_label_values( + require_family(samples_by_family, "proxy_process_eventloop_time"), "proxy_process_eventloop_time", "le", + {"0ms", "100ms", "2560ms"}) + require_label_values( + require_family(samples_by_family, "proxy_process_cache_volume_lookup_active"), "proxy_process_cache_volume_lookup_active", + "volume", {"0"}) + require_label_values( + require_family(samples_by_family, "proxy_process_cache_volume_lookup_success"), "proxy_process_cache_volume_lookup_success", + "volume", {"0"}) + + for family, samples in samples_by_family.items(): + for sample in samples: + if sample.get("method") == "completed": + raise RuntimeError(f"{family} incorrectly labels completed as an HTTP method") + + +def validate_prometheus_v2_text(text: str) -> None: + """ + Verify the raw v2 exposition has complete grouped metric families. + + :param text: Raw ATS Prometheus v2 output. + """ + current_name = None + current_type = None + current_has_sample = False + seen_families = set() + samples_by_family = {} + help_count = 0 + type_count = 0 + sample_count = 0 + + for line_no, line in enumerate(text.splitlines(), 1): + if not line: + continue + + help_match = HELP_RE.match(line) + if help_match is not None: + if current_name is not None and not current_has_sample: + raise RuntimeError(f"Line {line_no}: family {current_name} has no samples") + + current_name = help_match.group("name") + if current_name in seen_families: + raise RuntimeError(f"Line {line_no}: duplicate HELP for metric family {current_name}") + seen_families.add(current_name) + samples_by_family[current_name] = [] + current_type = None + current_has_sample = False + help_count += 1 + continue + + type_match = TYPE_RE.match(line) + if type_match is not None: + type_name = type_match.group("name") + if current_name is None: + raise RuntimeError(f"Line {line_no}: TYPE appears before HELP") + if type_name != current_name: 
+ raise RuntimeError(f"Line {line_no}: TYPE name {type_name} does not match HELP name {current_name}") + if current_type is not None: + raise RuntimeError(f"Line {line_no}: duplicate TYPE for metric family {current_name}") + + current_type = type_match.group("type") + if current_type not in ("counter", "gauge"): + raise RuntimeError(f"Line {line_no}: unsupported TYPE for {current_name}: {current_type}") + type_count += 1 + continue + + if line.startswith("#"): + raise RuntimeError(f"Line {line_no}: unsupported metadata line: {line}") + + sample_match = SAMPLE_RE.match(line) + if sample_match is None: + raise RuntimeError(f"Line {line_no}: invalid sample line: {line}") + if current_name is None or current_type is None: + raise RuntimeError(f"Line {line_no}: sample appears before HELP/TYPE") + + sample_name = sample_match.group("name") + expected_names = {current_name} + if current_type == "counter": + expected_names.add(f"{current_name}_total") + if sample_name not in expected_names: + raise RuntimeError(f"Line {line_no}: sample {sample_name} does not belong to family {current_name}") + + labels = parse_labels(sample_match.group("labels"), line_no) + samples_by_family[current_name].append(labels) + current_has_sample = True + sample_count += 1 + + if help_count == 0: + raise RuntimeError("No metric families found") + if current_name is not None and not current_has_sample: + raise RuntimeError(f"Metric family {current_name} has no samples") + if help_count != type_count: + raise RuntimeError(f"HELP/TYPE count mismatch: {help_count} HELP lines, {type_count} TYPE lines") + if sample_count < help_count: + raise RuntimeError(f"Expected at least one sample per family, saw {sample_count} samples for {help_count} families") + + validate_prometheus_v2_label_coverage(samples_by_family) + + def print_metrics(families: list) -> None: """ Print parsed metric families in Prometheus format. 
@@ -98,12 +421,26 @@ def main() -> int: print(f"Error fetching URL {args.url}: {e}", file=sys.stderr) return 1 + if args.validate_v2_format: + try: + validate_prometheus_v2_text(ats_output) + except RuntimeError as e: + print(f"Error validating Prometheus v2 metrics: {e}", file=sys.stderr) + return 1 + try: families = parse_ats_metrics(ats_output) except RuntimeError as e: print(f"Error parsing ATS metrics: {e}", file=sys.stderr) return 1 + if args.strict_family_metadata: + try: + validate_metric_families(families) + except RuntimeError as e: + print(f"Error validating metric families: {e}", file=sys.stderr) + return 1 + # Parsing issues may not arise until we try to print the metrics. try: print_metrics(families) diff --git a/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py b/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py index ab4b450373c..3b7ea77c15e 100644 --- a/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py +++ b/tests/gold_tests/pluginTest/stats_over_http/stats_over_http.test.py @@ -93,6 +93,133 @@ def __checkPrometheusMetrics(self, p: 'Test.Process', from_prometheus: bool): p.Streams.stdout += Testers.ContainsExpression( 'proxy_process_http_delete_requests 0', 'Verify the successful parsing of Prometheus metrics for a counter.') + def __checkPrometheusV2Metrics(self, p: "Test.Process"): + """Check the Prometheus v2 metrics output. + :param p: The process whose output to check. + """ + p.Streams.stdout += Testers.ContainsExpression( + "# HELP proxy_process_http_requests", + "Output should have a help line for the base metric name.", + ) + p.Streams.stdout += Testers.ContainsExpression( + "# TYPE proxy_process_http_requests counter", + "Output should have a type line for the base metric name.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{method="delete"}', + "Verify that HTTP method labels (GET, POST, DELETE, etc.) 
are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{method="extension_method"}', + "Verify that multi-token HTTP method labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{method="invalid_client"}', + "Verify that invalid client request labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests{direction="incoming"}', + "Verify that direction labels (incoming / outgoing) are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + "proxy_process_http_completed_requests", + "Verify that completed_requests remains its own lifecycle counter.", + ) + p.Streams.stdout += Testers.ExcludesExpression( + 'method="completed"', + "completed is not an HTTP method label.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_cache_fresh{result="hit"}', + "Verify that result labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_disallowed_continue{method="post", status="100"}', + "Verify that status code labels are extracted correctly.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_cache_volume_lookup_active{volume="0"}', + "Verify that volume labels are extracted from volume_N patterns.", + ) + + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_eventloop_count{le="', + "Verify that time buckets are correctly transformed into le labels.", + ) + + def __checkParsedPrometheusV2Metrics(self, p: "Test.Process"): + """Check the Prometheus parser's view of the v2 metrics output. + :param p: The process whose output to check. 
+ """ + p.Streams.stdout += Testers.ContainsExpression( + "# TYPE proxy_process_http_requests counter", + "Prometheus parser should recognize HTTP request metrics as one counter family.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests_total{method="delete"}', + "Parsed output should retain HTTP method labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests_total{method="extension_method"}', + "Parsed output should retain multi-token HTTP method labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_requests_total{direction="incoming"}', + "Parsed output should retain direction labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + "proxy_process_http_completed_requests_total", + "Parsed output should keep completed_requests as its own counter family.", + ) + p.Streams.stdout += Testers.ExcludesExpression( + 'method="completed"', + "completed should not be parsed as an HTTP method label.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_disallowed_continue_total{method="post",status="100"}', + "Parsed output should preserve multiple labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + "# TYPE proxy_process_http_responses counter", + "Prometheus parser should recognize HTTP response metrics as one counter family.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_responses_total{direction="incoming"}', + "Parsed output should retain direction labels on response metrics.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_responses_total{status="2xx"}', + "Parsed output should retain status code class labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_http_cache_ims_total{result="miss"}', + "Parsed output should retain cache result labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 
'proxy_process_http_transaction_counts_failed_total{result="errors",method="connect"}', + "Parsed output should preserve combined result and method labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_eventloop_count{le="100s"}', + "Parsed output should retain bucket labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_cache_volume_lookup_active{volume="0"}', + "Parsed output should preserve gauge labels.", + ) + p.Streams.stdout += Testers.ContainsExpression( + 'proxy_process_cache_volume_lookup_success_total{volume="0"}', + "Parsed output should preserve counter labels for volume metrics.", + ) + def __testCaseNoAccept(self): tr = Test.AddTestRun('Fetch stats over HTTP in JSON format: no Accept and default path') self.__checkProcessBefore(tr) @@ -127,6 +254,19 @@ def __testCaseAcceptPrometheus(self): tr.Processes.Default.TimeOut = 3 self.__checkProcessAfter(tr) + def __testCaseAcceptPrometheusV2(self): + tr = Test.AddTestRun("Fetch stats over HTTP in Prometheus v2 format via Accept header") + self.__checkProcessBefore(tr) + tr.MakeCurlCommand( + f"-vs -H'Accept: text/plain; version=2.0.0' --http1.1 http://127.0.0.1:{self.ts.Variables.port}/_stats", + ts=self.ts, + ) + tr.Processes.Default.ReturnCode = 0 + self.__checkPrometheusV2Metrics(tr.Processes.Default) + tr.Processes.Default.Streams.stderr = ("gold/stats_over_http_prometheus_v2_accept_stderr.gold") + tr.Processes.Default.TimeOut = 3 + self.__checkProcessAfter(tr) + def __testCasePathJSON(self): tr = Test.AddTestRun('Fetch stats over HTTP in JSON format via /_stats/json') self.__checkProcessBefore(tr) @@ -160,6 +300,19 @@ def __testCasePathPrometheus(self): tr.Processes.Default.TimeOut = 3 self.__checkProcessAfter(tr) + def __testCasePathPrometheusV2(self): + tr = Test.AddTestRun("Fetch stats over HTTP in Prometheus v2 format via /_stats/prometheus_v2") + self.__checkProcessBefore(tr) + tr.MakeCurlCommand( + f"-vs --http1.1 
http://127.0.0.1:{self.ts.Variables.port}/_stats/prometheus_v2", + ts=self.ts, + ) + tr.Processes.Default.ReturnCode = 0 + self.__checkPrometheusV2Metrics(tr.Processes.Default) + tr.Processes.Default.Streams.stderr = ("gold/stats_over_http_prometheus_v2_stderr.gold") + tr.Processes.Default.TimeOut = 3 + self.__checkProcessAfter(tr) + def __testCaseAcceptIgnoredIfPathExplicit(self): tr = Test.AddTestRun('Fetch stats over HTTP in Prometheus format with Accept csv header') self.__checkProcessBefore(tr) @@ -184,16 +337,36 @@ def __queryAndParsePrometheusMetrics(self): p.Command = f'{sys.executable} {ingester} http://127.0.0.1:{self.ts.Variables.port}/_stats/prometheus' p.ReturnCode = 0 self.__checkPrometheusMetrics(p, from_prometheus=True) + self.__checkProcessAfter(tr) + + def __queryAndParsePrometheusV2Metrics(self): + """ + Query the ATS stats over HTTP in Prometheus v2 format and parse the output. + """ + tr = Test.AddTestRun('Query and parse Prometheus v2 metrics') + ingester = 'prometheus_stats_ingester.py' + tr.Setup.CopyAs(ingester) + self.__checkProcessBefore(tr) + p = tr.Processes.Default + p.Command = ( + f'{sys.executable} {ingester} --validate-v2-format --strict-family-metadata ' + f'http://127.0.0.1:{self.ts.Variables.port}/_stats/prometheus_v2') + p.ReturnCode = 0 + self.__checkParsedPrometheusV2Metrics(p) + self.__checkProcessAfter(tr) def run(self): self.__testCaseNoAccept() self.__testCaseAcceptCSV() self.__testCaseAcceptPrometheus() + self.__testCaseAcceptPrometheusV2() self.__testCasePathJSON() self.__testCasePathCSV() self.__testCasePathPrometheus() + self.__testCasePathPrometheusV2() self.__testCaseAcceptIgnoredIfPathExplicit() self.__queryAndParsePrometheusMetrics() + self.__queryAndParsePrometheusV2Metrics() StatsOverHttpPluginTest().run()