From 7e941bbb3648b342298867baa14bdea31701df2d Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 14:39:58 +0530 Subject: [PATCH 1/7] feat(helm): remove netobserv and some calico updates for k3s --- modules/helm/calico.tf | 5 +++++ modules/helm/netobserv.tf | 12 ------------ modules/helm/variables.tf | 14 -------------- 3 files changed, 5 insertions(+), 26 deletions(-) delete mode 100644 modules/helm/netobserv.tf diff --git a/modules/helm/calico.tf b/modules/helm/calico.tf index b5b1084..7636b83 100644 --- a/modules/helm/calico.tf +++ b/modules/helm/calico.tf @@ -35,6 +35,11 @@ resource "helm_release" "calico" { { name = "installation.calicoNetwork.mtu" value = "1440" + }, + { + name = "kubeletVolumePluginPath" + value = "None" + type = "string" } ] diff --git a/modules/helm/netobserv.tf b/modules/helm/netobserv.tf deleted file mode 100644 index 2f9081c..0000000 --- a/modules/helm/netobserv.tf +++ /dev/null @@ -1,12 +0,0 @@ -# Netobserv Configuration -resource "helm_release" "netobserv" { - name = var.netobserv_configuration.name - namespace = var.netobserv_configuration.namespace - repository = var.netobserv_configuration.repository - chart = var.netobserv_configuration.chart - version = var.netobserv_configuration.version - create_namespace = var.netobserv_configuration.create_namespace - - depends_on = [ helm_release.calico, helm_release.cert-manager ] - timeout = 1800 -} diff --git a/modules/helm/variables.tf b/modules/helm/variables.tf index b485807..cff16ba 100644 --- a/modules/helm/variables.tf +++ b/modules/helm/variables.tf @@ -66,20 +66,6 @@ variable "calico_configuration" { } } -# --------------- NETOBSERV VARIABLES --------------- # -variable "netobserv_configuration" { - description = "Dictionary filled with Netobserv Operator Configuration Details" - type = map(string) - default = { - "name" = "netobserv" - "namespace" = "netobserv" - "repository" = "https://netobserv.io/static/helm" - "chart" = "netobserv-operator" - "version" = "1.11.0" 
- "create_namespace" = true - } -} - # --------------- EXTERNAL SECRETS VARIABLES --------------- # variable "external_secrets_configuration" { description = "Dictionary filled with External Secrets Operator Configuration Details" From 06356c9ec4a340814f83ff99972cb262a0d8f930 Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 14:41:01 +0530 Subject: [PATCH 2/7] feat(observability): using calico for network observability --- modules/observability/configmap.tf | 11 + .../observability/network_observability.tf | 154 +++-- modules/observability/otel-collector.tf | 29 +- modules/observability/proto/api.proto | 570 ++++++++++++++++++ 4 files changed, 667 insertions(+), 97 deletions(-) create mode 100644 modules/observability/configmap.tf create mode 100644 modules/observability/proto/api.proto diff --git a/modules/observability/configmap.tf b/modules/observability/configmap.tf new file mode 100644 index 0000000..67251ea --- /dev/null +++ b/modules/observability/configmap.tf @@ -0,0 +1,11 @@ +# ConfigMap for setting up the proto file for querying Goldmane GRPC API +resource "kubernetes_config_map" "goldmane_api_proto" { + metadata { + name = "goldmane-api-proto" + namespace = "calico-system" + } + + data = { + "api.proto" = file("${path.module}/proto/api.proto") + } +} diff --git a/modules/observability/network_observability.tf b/modules/observability/network_observability.tf index de6c665..5e74924 100644 --- a/modules/observability/network_observability.tf +++ b/modules/observability/network_observability.tf @@ -1,97 +1,91 @@ -resource "kubernetes_manifest" "network_observability" { - manifest = { - apiVersion = "flows.netobserv.io/v1beta2" - kind = "FlowCollector" - metadata = { - // The operator expects this specific name - name = "cluster" +# Print out the network flow logs to stdout from Goldmane API +resource "kubernetes_deployment" "goldmane_otel_adapter" { + metadata { + name = "goldmane-otel-adapter" + namespace = "calico-system" + } + + spec { + 
replicas = 1 + + selector { + match_labels = { + app = "goldmane-otel-adapter" + } } - spec = { - namespace = kubernetes_namespace.namespace.metadata[0].name - - // "Direct" mode sends logs straight to OTel - deploymentModel = "Direct" - agent = { - type = "eBPF" - ebpf = { - // Required for "PacketDrop" to read kernel drop reasons - privileged = true - - // Enable drop detection and TCP round trips metrics - features = ["PacketDrop", "FlowRTT"] + template { + metadata { + labels = { + app = "goldmane-otel-adapter" + } + } + + spec { + container { + name = "scraper" + image = "alpine:latest" + command = ["/bin/sh", "-c"] - // 25 means 1 in 25 packets - sampling = 25 - cacheActiveTimeout = "15s" - cacheMaxFlows = 100000 + args = [ + <<-EOF + # Install jq and curl + apk add --no-cache jq curl + + # Download and extract the grpcurl binary directly into our path + curl -sL https://github.com/fullstorydev/grpcurl/releases/download/v1.9.3/grpcurl_1.9.3_linux_x86_64.tar.gz | tar -xzf - -C /usr/local/bin grpcurl + + # Run the stream and pipe it to jq + grpcurl -import-path /etc/proto -proto api.proto \ + -cacert /etc/pki/tls/certs/tigera-ca-bundle.crt \ + -cert /goldmane-key-pair/tls.crt \ + -key /goldmane-key-pair/tls.key \ + -d '{"start_time_gte": 0, "aggregation_interval": 5}' \ + goldmane.calico-system.svc.cluster.local:7443 goldmane.Flows/Stream \ + | jq --unbuffered -c '.' 
+ EOF + ] + + volume_mount { + name = "proto-file" + mount_path = "/etc/proto" + read_only = true + } - // Ignore loopback traffic - excludeInterfaces = ["lo"] + volume_mount { + name = "goldmane-ca-bundle" + mount_path = "/etc/pki/tls/certs" + read_only = true + } - // Resource Constraints - resources = { - requests = { - cpu = "50m" - memory = "100Mi" - } - limits = { - cpu = "500m" - memory = "512Mi" - } + volume_mount { + name = "goldmane-key-pair" + mount_path = "/goldmane-key-pair" + read_only = true } } - } - // Disable default stack for the netobserv instance - loki = { - enable = false - } - prometheus = { - querier = { - enable = false + volume { + name = "proto-file" + config_map { + name = kubernetes_config_map.goldmane_api_proto.metadata[0].name + } } - } - consolePlugin = { - enable = false - } - // Enrichment settings - processor = { - logTypes = "Flows" - metrics = { - // Disable agent-side metrics generation to save CPU - disableAlerts = ["NetObservLokiError", "NetObservNoFlows"] + volume { + name = "goldmane-ca-bundle" + config_map { + name = "goldmane-ca-bundle" + } } - } - // Pushing metrics to the OTel Collector - exporters = [ - { - type = "OpenTelemetry" - openTelemetry = { - targetHost = "otel-collector.${kubernetes_namespace.namespace.metadata[0].name}.svc.cluster.local" - targetPort = 4317 - protocol = "grpc" - - logs = { - enable = true - pushTimeInterval = "20s" - expiryTime = "2m" - } - - metrics = { - enable = true - } - tls = { - enable = false - insecureSkipVerify = true - } + volume { + name = "goldmane-key-pair" + secret { + secret_name = "goldmane-key-pair" } } - ] + } } } - - depends_on = [ helm_release.otel_collector ] } diff --git a/modules/observability/otel-collector.tf b/modules/observability/otel-collector.tf index 5e9880a..6fc12b9 100644 --- a/modules/observability/otel-collector.tf +++ b/modules/observability/otel-collector.tf @@ -68,7 +68,7 @@ resource "helm_release" "otel_collector" { } // Scrape Node CPU/RAM/Disk 
hostMetrics = { - enabled = true + enabled = false } // Scrape Pod CPU/RAM (Kubelet) kubeletMetrics = { @@ -85,6 +85,17 @@ resource "helm_release" "otel_collector" { // Custom Configuration for receivers config = { receivers = { + // Custom Host Metrics receiver configuration + hostmetrics = { + collection_interval = "10s" + scrapers = { + cpu = {} + memory = {} + disk = {} + network = {} + load = {} + } + } // OTLP Endpoints to send stuff to this collector otlp = { protocols = { @@ -206,17 +217,6 @@ resource "helm_release" "otel_collector" { limit_mib = 400 spike_limit_mib = 100 } - // Tag Netobserv logs appropriately - "resource/netobserv" = { - attributes = [ - { - key = "log.source" - value = "netobserv" - action = "insert" - } - ] - } - transform = { // If a metric comes in missing its namespace or pod label, // look at the underlying server/container it came from. @@ -276,11 +276,6 @@ resource "helm_release" "otel_collector" { processors = ["memory_limiter", "batch"] exporters = ["debug"] } - "logs/netobserv" = { - receivers = ["otlp"] - processors = ["memory_limiter", "resource/netobserv", "batch"] - exporters = ["otlphttp"] - } } } } diff --git a/modules/observability/proto/api.proto b/modules/observability/proto/api.proto new file mode 100644 index 0000000..36917fb --- /dev/null +++ b/modules/observability/proto/api.proto @@ -0,0 +1,570 @@ +syntax = "proto3"; + +package goldmane; + +option go_package = "./proto"; + +// Flows provides APIs for querying aggregated Flow data. +// +// The returned Flows will be aggregated across cluster nodes, as well as the specified aggregation +// time interval. +service Flows { + // List is an API call to query for one or more Flows. + rpc List(FlowListRequest) returns (FlowListResult); + + // Stream is an API call to return a long running stream of new Flows as they are generated. 
+ rpc Stream(FlowStreamRequest) returns (stream FlowResult); + + // FilterHints can be used to discover available filter criteria, such as + // Namespaces and source / destination names. It allows progressive filtering of criteria based on + // other filters. i.e., return the flow destinations given a source namespace. + // Note that this API provides hints to the UI based on past flows and other values may be valid. + rpc FilterHints(FilterHintsRequest) returns (FilterHintsResult); +} + +// FlowListRequest defines a message to request a particular selection of aggregated Flow objects. +message FlowListRequest { + // StartTimeGt specifies the beginning of a time window with which to filter Flows. Flows + // will be returned only if their start time is greater than or equal to the given value. + // + // - A value of zero indicates the oldest start time available by the server. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_gte = 1; + + // StartTimeLt specifies the end of a time window with which to filter flows. Flows will + // be returned only if their start time occurs before the requested time. + // + // - A value of zero means "now", as determined by the server at the time of request. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_lt = 2; + + // Page specifies the page to return. It requires that PageSize is also specified in order + // to determine page boundaries. Note that pages may change over time as new flow data is collected or expired. + // Querying the same page at different points in time may return different results. 
+ int64 page = 3; + + // PageSize configures the maximum number of results to return as part of this query. + int64 page_size = 4; + + // SortBy configures how to sort the results of this query. By default flows are sorted by start time. + // The returned list is sorted by each sort option, in order, using the next sort option in the list as a tie-breaker. + // Note: At the moment, only a single sort option is supported. + repeated SortOption sort_by = 5; + + // Filter allows specification of one or more criteria on which to filter the returned Flows. + Filter filter = 6; + + // AggregationInterval is the width of the time window in seconds across which to aggregate when generating + // Flows to return. This must be a multiple of 15. + int64 aggregation_interval = 7; +} + +// FlowListResult is a message containing a list of FlowResults and ListMetadata. +message FlowListResult { + // Meta specifies metadata about the returned flows. + ListMetadata meta = 1; + + // Flows is a list of FlowResult objects. + repeated FlowResult flows = 2; +} + +// FlowStreamRequest defines a message to request a stream of aggregated Flows. +message FlowStreamRequest { + // StartTimeGt specifies the beginning of a time window from which to stream Flows. Flows + // will be streamed only if their start time is greater than or equal to the given value. + // + // - A value of zero means "now", as determined by the server at the time of request. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_gte = 1; + + // Filter allows specification of one or more criteria on which to filter the returned Flows. + Filter filter = 2; + + // AggregationInterval defines both the frequency of streamed updates for each Flow, and the amount of time that FlowResult covers. + // It must always be 15s. 
+ // + // Every AggregationInterval the server must send a FlowResult containing the aggregated data for that Flow from a + // time interval of width AggregationInterval. + // + // For a Flow that has continuous traffic, the server should send updates covering the range + // [now-2*AggregationInterval, now-AggregationInterval] so that the data is reasonably likely to be complete. + int64 aggregation_interval = 3; +} + +message FilterHintsRequest { + // Type is type of Filter to query. + FilterType type = 1; + + // Filter is a set of filter criteria used to narrow down returned results. + Filter filter = 2; + + // StartTimeGt specifies the beginning of a time window with which to filter (inclusive). + // + // - A value of zero indicates the oldest start time available by the server. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_gte = 3; + + // StartTimeLt specifies the end of a time window with which to filter. + // + // - A value of zero means "now", as determined by the server at the time of request. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_lt = 4; + + // Page specifies the page number to return. It requires that PageSize is also specified in order + // to determine page boundaries. Note that pages may change over time as new flow data is collected or expired. + // Querying the same page at different points in time may return different results. + int64 page = 5; + + // PageSize configures the maximum number of results to return as part of this query. + int64 page_size = 6; +} + +message FilterHintsResult { + // ListMetadata specifies list information about the flows returned. 
+ ListMetadata meta = 1; + + // FilterHint contains the values that flows can be filtered on. + repeated FilterHint hints = 2; +} + +// ListMetadata contains information about a returned list of items, such as pagination information (total number of pages +// and total number of results). +message ListMetadata { + // totalPages is the total number of pages that exist given that a pageSize was specified. + int64 totalPages = 1; + + // TotalResults are the total number of results that would have been returned if no pagination was specified. + int64 totalResults = 2; +} + +message FilterHint { + string value = 1; +} + +// FilterType specifies which fields on the underlying Flow data to collect. +enum FilterType { + FilterTypeUnspecified = 0; + FilterTypeDestName = 1; + FilterTypeSourceName = 2; + FilterTypeDestNamespace = 3; + FilterTypeSourceNamespace = 4; + FilterTypePolicyTier = 5; + FilterTypePolicyName = 6; + FilterTypePolicyKind = 7; + FilterTypePolicyNamespace = 8; +} + +// FlowResult wraps a Flow object with additional metadata. +message FlowResult { + // ID is an opaque integer value ID that can be used to identify a Flow, and is 1:1 with the FlowKey. + // Note that this ID is not valid across server restarts. Its primary use-case is for correlating FlowResult + // updates from a Stream request. + int64 id = 1; + + // The Flow object itself. + Flow flow = 2; +} + +enum Action { + ActionUnspecified = 0; + Allow = 1; + Deny = 2; + Pass = 3; +} + +// Filter defines criteria for selecting a set of Flows based on their parameters. +message Filter { + // SourceNames allows filtering on the source name field. Combined using logical OR. + repeated StringMatch source_names = 1; + + // SourceNamespaces filters on the source namespace field. Combined using logical OR. + repeated StringMatch source_namespaces = 2; + + // DestNames filters on the destination name field. Combined using logical OR. 
+ repeated StringMatch dest_names = 3; + + // DestNamespaces filters on the destination namespace field. Combined using logical OR. + repeated StringMatch dest_namespaces = 4; + + // Protocols filters on the protocol field. Combined using logical OR. + repeated StringMatch protocols = 5; + + // DestPorts filters on the port field. Combined using logical OR. + repeated PortMatch dest_ports = 6; + + // Actions filters on the action field. Combined using logical OR. + repeated Action actions = 7; + + // Policies matches on policy fields. Combined using logical OR. + repeated PolicyMatch policies = 8; + + // Reporter filters on the reporter field. + Reporter reporter = 9; + + // Pending/Staged Actions filters on the action field. Combined using logical OR. + repeated Action pending_actions = 10; +} + +enum MatchType { + // Match the value exactly. + Exact = 0; + + // Use fuzzy matching on the value. + Fuzzy = 1; +} + +message StringMatch { + string value = 1; + MatchType type = 2; +} + +message PortMatch { + int64 port = 1; +} + +message SortOption { + // SortBy declares the field by which to sort. + SortBy sort_by = 1; +} + +// PolicyMatch defines criteria for matching one or more policy rules within a Flow's +// policy trace. +message PolicyMatch { + PolicyKind kind = 1; + string tier = 2; + string namespace = 3; + string name = 4; + Action action = 5; +} + +enum PolicyKind { + // Unspecified + KindUnspecified = 0; + + // Calico policy types. + CalicoNetworkPolicy = 1; + GlobalNetworkPolicy = 2; + StagedNetworkPolicy = 3; + StagedGlobalNetworkPolicy = 4; + StagedKubernetesNetworkPolicy = 5; + + // Native Kubernetes types. + NetworkPolicy = 6; + ClusterNetworkPolicy = 7; + + // Calico Profiles. + Profile = 9; + EndOfTier = 10; +} + +enum SortBy { + Time = 0; + DestName = 1; + DestNamespace = 2; + DestType = 3; + SourceName = 4; + SourceNamespace = 5; + SourceType = 6; +} + +// FlowCollector provides APIs capable of receiving streams of Flow data from cluster nodes. 
+service FlowCollector { + // Connect receives a connection that may stream one or more FlowUpdates. A FlowReceipt is returned + // to the client by the server after each FlowUpdate. + // + // Following a connection or reconnection to the server, clients should send duplicates of previously transmitted FlowUpdates + // in order to allow the server to rebuild its cache, as well as any new FlowUpdates that have not previously been transmitted. + // The server is responsible for deduplicating where needed. + rpc Connect(stream FlowUpdate) returns (stream FlowReceipt); +} + +// FlowReceipt is a response from the server to a client after publishing a Flow. +message FlowReceipt {} + +// FlowUpdate wraps a Flow with additional metadata. +message FlowUpdate { + // Flow contains the actual flow being sent. + Flow flow = 1; +} + +enum EndpointType { + // For queries, unspecified means "do not filter on this field". + EndpointTypeUnspecified = 0; + + // WorkloadEndpoint represents an application endpoint with its own network identity. For example, + // a Kubernetes Pod. + WorkloadEndpoint = 1; + + // HostEndpoint represents a host machine. + HostEndpoint = 2; + + // NetworkSet represents an address from within a configured projectcalico.org/v3 NetworkSet or + // GlobalNetworkSet. + NetworkSet = 3; + + // Network represents an endpoint on a public or private network not known by Calico. For example, + // traffic from the public internet or private LAN not covered by a NetworkSet. + Network = 4; +} + +enum Reporter { + // For queries, unspecified means "do not filter on this field". + ReporterUnspecified = 0; + Src = 1; + Dst = 2; +} + +// FlowKey includes the identifying fields for a Flow. +// - Source: Name, namespace, type, and labels. +// - Destination: Name, namespace, type, labels and port +// - Action taken on the connection. +// - Reporter (i.e., measured at source or destination). +// - Protocol of the connection (TCP, UDP, etc.). 
+message FlowKey { + // SourceName is the name of the source for this Flow. + // The value is contextualized by the source_type field: + // - For WorkloadEndpoint, this represents a set of pods that share a GenerateName. + // - For HostEndpoint, this is the host endpoint name. + // - For NetworkSet, it is the name of the network set. + // - For Network, this is either "pub" for a public network, or "pvt" for a private network. + string source_name = 1; + + // SourceNamespace is the namespace of the source pods for this flow. + string source_namespace = 2; + + // SourceType is the type of the source, used to contextualize the source + // name and namespace fields. + EndpointType source_type = 3; + + // DestName is the name of the destination for this Flow. + // The value is contextualized by the dest_type field: + // - For WorkloadEndpoint, this represents a set of pods that share a GenerateName. + // - For HostEndpoint, this is the host endpoint name. + // - For NetworkSet, it is the name of the network set. + // - For Network, this is either "pub" for a public network, or "pvt" for a private network. + string dest_name = 4; + + // DestNamespace is the namespace of the destination pods for this flow. + string dest_namespace = 5; + + // DestType is the type of the destination, used to contextualize the dest + // name and namespace fields. + EndpointType dest_type = 6; + + // DestPort is the destination port on the specified protocol accessed by this flow. + int64 dest_port = 7; + + // DestServiceName is the name of the destination service, if any. + string dest_service_name = 8; + + // DestServiceNamespace is the namespace of the destination service, if any. + string dest_service_namespace = 9; + + // DestServicePortName is the name of the port on the destination service, if any. + string dest_service_port_name = 10; + + // DestServicePort is the port number on the destination service. + int64 dest_service_port = 11; + + // Proto is the L4 protocol for this flow. 
For example, TCP, UDP, SCTP, ICMP. + string proto = 12; + + // Reporter is either "src" or "dst", depending on whether this flow was generated + // at the initiating or terminating end of the connection attempt. + Reporter reporter = 13; + + // Action is the ultimate action taken on the flow. + Action action = 14; + + // Policies includes an entry for each policy rule that took an action on the connections + // aggregated into this flow. + PolicyTrace policies = 15; +} + +// Flow is a message representing statistics gathered about connections that share common fields, +// aggregated across either time, nodes, or both. +message Flow { + // Key includes the identifying fields for this flow. + FlowKey Key = 1; + + // StartTime is the start time for this flow. It is represented as the number of + // seconds since the UNIX epoch. + int64 start_time = 2; + + // EndTime is the end time for this flow. It is always at least one aggregation + // interval after the start time. + int64 end_time = 3; + + // SourceLabels contains the intersection of labels that appear on all source + // pods that contributed to this flow. + repeated string source_labels = 4; + + // DestLabels contains the intersection of labels that appear on all destination + // pods that contributed to this flow. + repeated string dest_labels = 5; + + // Statistics. + int64 packets_in = 6; + int64 packets_out = 7; + int64 bytes_in = 8; + int64 bytes_out = 9; + + // NumConnectionsStarted tracks the total number of new connections recorded for this Flow. It counts each + // connection attempt that matches the FlowKey that was made between this Flow's StartTime and EndTime. + int64 num_connections_started = 10; + + // NumConnectionsCompleted tracks the total number of completed TCP connections recorded for this Flow. It counts each + // connection that matches the FlowKey that was completed between this Flow's StartTime and EndTime. 
+ int64 num_connections_completed = 11; + + // NumConnectionsLive tracks the total number of still active connections recorded for this Flow. It counts each + // connection that matches the FlowKey that was active at this Flow's EndTime. + int64 num_connections_live = 12; +} + +message PolicyTrace { + // EnforcedPolicies shows the active dataplane policy rules traversed by this Flow. + repeated PolicyHit enforced_policies = 1; + + // PendingPolicies shows the expected policy rules traversed by this Flow when including + // staged policies. + repeated PolicyHit pending_policies = 2; +} + +// PolicyHit represents a policy rule that was traversed by this flow. It can be either an enforced policy hit +// from the dataplane, or a staged policy hit that is not yet active. +message PolicyHit { + // Kind corresponds to the resource Kind for the policy. + PolicyKind kind = 1; + + // Namespace is the Kubernetes namespace of the Policy, if namespaced. It is empty for global / + // cluster-scoped policy kinds. + string namespace = 2; + + // Name is the Name of the policy object. + string name = 3; + + // Tier is the Tier of the policy object. + string tier = 4; + + // Action is the action taken by this policy rule. + Action action = 5; + + // PolicyIndex is the order of the Policy among all policies traversed. + int64 policy_index = 6; + + // RuleIndex is the order of the Rule within the Policy rules. + int64 rule_index = 7; + + // Trigger indicates the first policy that selected this Flow and thus triggered the tier's + // end-of-tier action. This is only valid for kind=EndOfTier, and is nil otherwise. + PolicyHit trigger = 8; +} + +// Statistics provides APIs for retrieving Flow statistics. +service Statistics { + // List returns statistics data for the given request. One StatisticsResult will be returned for + // each matching PolicyHit and direction over the timeframe, containing time-series data covering the + // provided time range. 
+ rpc List(StatisticsRequest) returns (stream StatisticsResult); +} + +// StatisticType represents the types of data available over the Statistics API endpoint. +enum StatisticType { + PacketCount = 0; + ByteCount = 1; + LiveConnectionCount = 2; +} + +enum StatisticsGroupBy { + // Policy configures statistics groupings on a per-policy basis. + Policy = 0; + + // PolicyRule configures statistics groupings on a per-policy-rule basis. + PolicyRule = 1; +} + +message StatisticsRequest { + // The start time from which to collect statistics (inclusive). + // + // - A value of zero indicates the oldest start time available by the server. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_gte = 1; + + // The end time indicates the end of the windows from which to collect statistics. + // + // - A value of zero means "now", as determined by the server at the time of request. + // - A value greater than zero indicates an absolute time in seconds since the Unix epoch. + // - A value less than zero indicates a relative number of seconds from "now", as determined by the server. + int64 start_time_lt = 2; + + // Type is the type of statistic to return. e.g., packets, bytes, etc. + StatisticType type = 3; + + // Configure statistics aggregation. + // - Policy: each StatisticsResult will contain statistics for a particular policy. + // - PolicyRule: each StatisticsResult will contain statistics for a particular policy rule. + // - Any: return both per-Policy and per-PolicyRule results. + StatisticsGroupBy group_by = 4; + + // Optionally configure fields to filter results. If provided, any policies not matching the PolicyMatch + // will be omitted from the results. + PolicyMatch policy_match = 5; + + // TimeSeries configures whether or not to return time-series data in the response. 
If true, + // the response will include multiple datapoints over the given time window. If false, data + // across the time window will be aggregated into a single data point. + bool time_series = 6; +} + +enum RuleDirection { + Any = 0; + Ingress = 1; + Egress = 2; +} + +message StatisticsResult { + // Policy identifies the policy / rule for which this data applies. Its meaning is contextualized + // by the GroupBy field. + // + // - StatisticsGroupBy_Policy: this field represents the specific Policy, and statistics are aggregated across all + // rules within that policy. Rule identifiers (Action, RuleID) will be omitted. + // + // - StatisticsGroupBy_PolicyRule: this field identifies a specific rule within a Policy, and statistics are scoped to + // that particular rule. + PolicyHit policy = 1; + + // For statistics results targeting a specific policy rule, the direction + // contextualizes the rule ID as either an ingress or egress rule. + // + // For statistics results grouped by policy, both ingress and egress statistics will be included. + RuleDirection direction = 2; + + // GroupBy indicates whether the statistics in this result are aggregated for a policy, or for + // a specific rule within that policy. + StatisticsGroupBy group_by = 3; + + // Type indicates the type of data carried in this result. e.g., PacketCount vs ByteCount. + StatisticType type = 4; + + // AllowedIn contains the count of the requested statistic that was allowed for ingress flows. + // The semantic meaning (e.g., packets vs bytes) is indicated by the Type field. + repeated int64 allowed_in = 5; + repeated int64 allowed_out = 6; + + repeated int64 denied_in = 7; + repeated int64 denied_out = 8; + + repeated int64 passed_in = 9; + repeated int64 passed_out = 10; + + // X is the x axis of the data for time-series data. i.e., the timestamp. For non-timeseries data, + // this will be nil. 
+ repeated int64 x = 11; +} From 0b4c1ecdb67b1c84a8ebec58272f20e9460ac6dd Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 14:41:21 +0530 Subject: [PATCH 3/7] feat(observability): dashboard fixes for network observability --- modules/observability/dashboards/cluster.json | 12 +- modules/observability/dashboards/network.json | 640 +++--------------- 2 files changed, 100 insertions(+), 552 deletions(-) diff --git a/modules/observability/dashboards/cluster.json b/modules/observability/dashboards/cluster.json index bb93e0a..ea03265 100644 --- a/modules/observability/dashboards/cluster.json +++ b/modules/observability/dashboards/cluster.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 0, + "id": 9, "links": [], "panels": [ { @@ -415,7 +415,7 @@ "uid": "P4169E866C3094E38" }, "editorMode": "code", - "expr": "sum(container_fs_usage_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",instance=~\"^$Node$\"}) / sum(container_fs_limit_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",instance=~\"^$Node$\"}) * 100", + "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$Node$\"}) / sum(container_fs_limit_bytes{id=\"/\",instance=~\"^$Node$\"}) * 100", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -780,7 +780,7 @@ } ] }, - "unit": "bytes" + "unit": "decbytes" }, "overrides": [] }, @@ -817,7 +817,7 @@ "uid": "P4169E866C3094E38" }, "editorMode": "code", - "expr": "sum(container_fs_usage_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",instance=~\"^$Node$\"})", + "expr": "max(container_fs_usage_bytes{device=~\"^/dev/([sv]d[a-z][1-9]|nvme[0-9]n[0-9]p[0-9])$\",id=\"/\",instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "range": true, @@ -897,7 +897,7 @@ "uid": "P4169E866C3094E38" }, "editorMode": "code", - "expr": "sum(container_fs_limit_bytes{device=~\"^/dev/[sv]d[a-z][1-9]$\",id=\"/\",instance=~\"^$Node$\"})", + "expr": 
"max(container_fs_limit_bytes{device=~\"^/dev/([sv]d[a-z][1-9]|nvme[0-9]n[0-9]p[0-9])$\",id=\"/\",instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "range": true, @@ -2123,5 +2123,5 @@ "timezone": "browser", "title": "Kubernetes Cluster Level Monitoring", "uid": "c98856d7-3f9c-4ab2-bf54-11fd6c046ef1", - "version": 4 + "version": 1 } \ No newline at end of file diff --git a/modules/observability/dashboards/network.json b/modules/observability/dashboards/network.json index a425d62..a3817d5 100644 --- a/modules/observability/dashboards/network.json +++ b/modules/observability/dashboards/network.json @@ -36,8 +36,8 @@ }, { "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, "fieldConfig": { "defaults": { @@ -51,13 +51,13 @@ } ] }, - "unit": "short" + "unit": "bytes" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 6, + "w": 13, "x": 0, "y": 1 }, @@ -82,78 +82,23 @@ "pluginVersion": "12.3.3", "targets": [ { - "editorMode": "code", - "expr": "sum(rate(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\"}[5m]))", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Total Cluster Egress Rate", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - } - ] + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 1 - }, - "id": 5, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - 
"showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.3.3", - "targets": [ - { "editorMode": "code", - "expr": "sum(rate(netobserv_workload_ingress_bytes_total{DstK8S_Namespace=~\"$DstNamespace\"}[5m]))", - "legendFormat": "__auto", - "range": true, + "expr": "service.name:=\"goldmane-otel-adapter\"\n| unpack_json\n| filter flow.Key.reporter:=\"Src\" AND flow.Key.sourceNamespace:~\"^$SrcNamespace$\"\n| stats sum(flow.bytesOut) as Total_Egress_Bytes", + "queryType": "statsRange", "refId": "A" } ], - "title": "Total Cluster Ingress Rate", + "title": "Total Cluster Egress", "type": "stat" }, { "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, "fieldConfig": { "defaults": { @@ -173,11 +118,11 @@ }, "gridPos": { "h": 8, - "w": 6, - "x": 12, + "w": 11, + "x": 13, "y": 1 }, - "id": 6, + "id": 5, "options": { "colorMode": "background", "graphMode": "none", @@ -198,72 +143,17 @@ "pluginVersion": "12.3.3", "targets": [ { - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(netobserv_workload_rtt_seconds_bucket{SrcK8S_Namespace=~\"$SrcNamespace\"}[5m])) by (le))", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Global TCP Latency (P95)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - } - ] + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 7, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - 
"lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.3.3", - "targets": [ - { "editorMode": "code", - "expr": "sum(rate(netobserv_workload_drop_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\"}[5m]))", - "legendFormat": "__auto", - "range": true, + "expr": "service.name:=\"goldmane-otel-adapter\"\n| unpack_json\n| filter flow.Key.reporter:=\"Dst\" AND flow.Key.destNamespace:~\"^$DstNamespace$\"\n| stats sum(flow.bytesIn) as Total_Ingress_Bytes", + "queryType": "statsRange", "refId": "A" } ], - "title": "Active Network Drops", + "title": "Total Cluster Ingress", "type": "stat" }, { @@ -274,74 +164,6 @@ "x": 0, "y": 9 }, - "id": 2, - "panels": [], - "title": "Visual Network Topology", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" - }, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 23, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 1, - "options": { - "edges": {}, - "layoutAlgorithm": "force", - "nodes": {}, - "zoomMode": "greedy" - }, - "pluginVersion": "12.3.3", - "targets": [ - { - "editorMode": "code", - "exemplar": false, - "expr": "label_join(\n label_replace(\n label_replace(\n sum by (SrcK8S_Namespace, DstK8S_Namespace) (rate(netobserv_workload_flows_total{SrcK8S_Namespace!=\"\", DstK8S_Namespace!=\"\"}[5m])) > 0,\n \"target\", \"$1\", \"DstK8S_Namespace\", \"(.*)\"\n ),\n \"source\", \"$1\", \"SrcK8S_Namespace\", \"(.*)\"\n ),\n \"id\", \"-\", \"source\", \"target\"\n)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "Dynamic Namespace Topology Map", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "DstK8S_Namespace": true, - "SrcK8S_Namespace": true, - "Time": true - }, - "includeByName": {}, - "indexByName": {}, - "renameByName": { - "Value": "mainStat" - } - } - } 
- ], - "type": "nodeGraph" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, "id": 8, "panels": [], "title": "Cross-Namespace Top Talkers", @@ -349,8 +171,8 @@ }, { "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, "fieldConfig": { "defaults": { @@ -382,7 +204,7 @@ }, "showPoints": "auto", "showValues": false, - "spanNulls": false, + "spanNulls": true, "stacking": { "group": "A", "mode": "none" @@ -398,10 +220,6 @@ { "color": "green", "value": 0 - }, - { - "color": "red", - "value": 80 } ] } @@ -409,10 +227,10 @@ "overrides": [] }, "gridPos": { - "h": 8, - "w": 12, + "h": 14, + "w": 24, "x": 0, - "y": 34 + "y": 10 }, "id": 9, "options": { @@ -431,110 +249,18 @@ "pluginVersion": "12.3.3", "targets": [ { - "editorMode": "code", - "expr": "topk(10, sum by (SrcK8S_Namespace, DstK8S_Namespace) (rate(netobserv_workload_flows_total{SrcK8S_Namespace=~\"$SrcNamespace\", DstK8S_Namespace=~\"$DstNamespace\"}[5m])))", - "legendFormat": "{{SrcK8S_Namespace}} -> {{DstK8S_Namespace}}", - "range": true, - "refId": "A" - } - ], - "title": "Top 10 Cross-Namespace Flows", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "red", - "mode": "shades" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "showValues": false, - "spanNulls": false, - "stacking": { - "group": "A", - "mode": 
"none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 34 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "12.3.3", - "targets": [ - { "editorMode": "code", - "expr": "topk(10, sum by (SrcK8S_OwnerName) (rate(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\", DstK8S_Namespace=\"\"}[5m])))", - "legendFormat": "{{SrcK8S_OwnerName}} -> External", - "range": true, + "expr": "service.name:=\"goldmane-otel-adapter\"\n| unpack_json\n| filter flow.Key.reporter:=\"Src\" \n| stats by (_time: 1m, flow.Key.sourceNamespace, flow.Key.destNamespace) sum(flow.bytesOut) as Bytes\n| sort by (Bytes) desc \n| limit 10", + "legendFormat": "{{ flow.Key.sourceNamespace }} -> {{ flow.Key.destNamespace }}", + "queryType": "statsRange", "refId": "A" } ], - "title": "Top External Talkers", + "title": "Top 10 Cross-Namespace Flows", "type": "timeseries" }, { @@ -543,7 +269,7 @@ "h": 1, "w": 24, "x": 0, - "y": 42 + "y": 24 }, "id": 11, "panels": [], @@ -552,8 +278,8 @@ }, { "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, "fieldConfig": { "defaults": { @@ -585,7 +311,7 @@ }, "showPoints": "auto", "showValues": false, - "spanNulls": false, + "spanNulls": true, "stacking": { "group": "A", "mode": "none" @@ -616,7 +342,7 @@ "h": 8, "w": 12, "x": 0, - "y": 43 + "y": 25 }, "id": 12, "options": { @@ -635,10 +361,14 @@ "pluginVersion": "12.3.3", 
"targets": [ { + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" + }, "editorMode": "code", - "expr": "topk(10, sum by (SrcK8S_OwnerName) (rate(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\", SrcK8S_OwnerName=~\"$Pod\"}[5m])))", - "legendFormat": "__auto", - "range": true, + "expr": "service.name:=\"goldmane-otel-adapter\"\n| unpack_json\n| filter flow.Key.reporter:=\"Src\" AND flow.Key.sourceNamespace:~\"^$SrcNamespace$\"\n| stats by (_time: 1m, flow.Key.sourceName) sum(flow.bytesOut) as Egress_Bytes\n| sort by (Egress_Bytes) desc \n| limit 10", + "legendFormat": "{{ flow.Key.sourceName }}", + "queryType": "statsRange", "refId": "A" } ], @@ -647,8 +377,8 @@ }, { "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, "fieldConfig": { "defaults": { @@ -680,7 +410,7 @@ }, "showPoints": "auto", "showValues": false, - "spanNulls": false, + "spanNulls": true, "stacking": { "group": "A", "mode": "none" @@ -707,7 +437,7 @@ "h": 8, "w": 12, "x": 12, - "y": 43 + "y": 25 }, "id": 13, "options": { @@ -726,218 +456,18 @@ "pluginVersion": "12.3.3", "targets": [ { - "editorMode": "code", - "expr": "topk(10, sum by (DstK8S_OwnerName) (rate(netobserv_workload_ingress_bytes_total{DstK8S_Namespace=~\"$DstNamespace\", DstK8S_OwnerName=~\"$Pod\"}[5m])))", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Top 10 Pods by Ingress", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 51 - }, - "id": 14, - "panels": [], - "title": "TCP Health & Network Drops", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": 
"", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "showValues": false, - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 52 - }, - "id": 15, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "12.3.3", - "targets": [ - { - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum by (le, SrcK8S_OwnerName) (rate(netobserv_workload_rtt_seconds_bucket{SrcK8S_Namespace=~\"$SrcNamespace\", SrcK8S_OwnerName=~\"$Pod\"}[5m])))", - "legendFormat": "{{ SrcK8S_OwnerName }}", - "range": true, - "refId": "A" - } - ], - "title": "TCP Latency (RTT) by Pod", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P4169E866C3094E38" - }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "red", - "mode": "shades" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - 
"lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "showValues": false, - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 80 - } - ] + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 52 - }, - "id": 16, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "12.3.3", - "targets": [ - { "editorMode": "code", - "expr": "sum by (SrcK8S_OwnerName) (rate(netobserv_workload_drop_bytes_total{SrcK8S_Namespace=~\"$SrcNamespace\", SrcK8S_OwnerName=~\"$Pod\"}[5m]))", - "legendFormat": "Dropped: {{SrcK8S_OwnerName}}", - "range": true, + "expr": "service.name:=\"goldmane-otel-adapter\"\n| unpack_json\n| filter flow.Key.reporter:=\"Dst\" AND flow.Key.destNamespace:~\"^$DstNamespace$\"\n| stats by (_time: 1m, flow.Key.destName) sum(flow.bytesIn) as Ingress_Bytes\n| sort by (Ingress_Bytes) desc \n| limit 10", + "legendFormat": "{{ flow.Key.destName }}", + "queryType": "statsRange", "refId": "A" } ], - "title": "Dropped Traffic by Pod", + "title": "Top 10 Pods by Ingress", "type": "timeseries" }, { @@ -946,7 +476,7 @@ "h": 1, "w": 24, "x": 0, - "y": 60 + "y": 33 }, "id": 17, "panels": [], @@ -966,18 +496,20 @@ "h": 24, "w": 24, "x": 0, - "y": 61 + "y": 34 }, "id": 18, "options": { "dedupStrategy": "none", - "enableInfiniteScrolling": false, + "detailsMode": "inline", + "enableInfiniteScrolling": true, "enableLogDetails": true, "prettifyLogMessage": true, "showControls": true, "showLabels": true, 
"showTime": true, "sortOrder": "Descending", + "syntaxHighlighting": true, "wrapLogMessage": true }, "pluginVersion": "12.3.3", @@ -989,12 +521,12 @@ }, "direction": "desc", "editorMode": "code", - "expr": "source.k8s.namespace.name: \"$SrcNamespace\"", + "expr": "service.name:=\"goldmane-otel-adapter\"\n| unpack_json\n| filter flow.Key.sourceNamespace:~\"^$SrcNamespace$\" OR flow.Key.destNamespace:~\"^$SrcNamespace$\"", "queryType": "instant", "refId": "A" } ], - "title": "Real-Time eBPF Network Logs", + "title": "Real-Time Network Logs", "type": "logs" } ], @@ -1013,16 +545,21 @@ "postgres" ] }, - "definition": "label_values(netobserv_workload_egress_bytes_total,SrcK8S_Namespace)", + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" + }, + "definition": "service.name:=\"goldmane-otel-adapter\" | unpack_json | stats by (flow.Key.sourceNamespace) count() | keep flow.Key.sourceNamespace", "includeAll": true, "label": "Source Namespace", - "multi": true, "name": "SrcNamespace", "options": [], "query": { - "qryType": 1, - "query": "label_values(netobserv_workload_egress_bytes_total,SrcK8S_Namespace)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" + "field": "flow.Key.sourceNamespace", + "limit": 25, + "query": "service.name:=\"goldmane-otel-adapter\" | unpack_json | stats by (flow.Key.sourceNamespace) count() | keep flow.Key.sourceNamespace", + "refId": "VictoriaLogsVariableQueryEditor-VariableQuery", + "type": "fieldValue" }, "refresh": 1, "regex": "", @@ -1037,16 +574,21 @@ "garage" ] }, - "definition": "label_values(netobserv_workload_ingress_bytes_total,DstK8S_Namespace)", + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" + }, + "definition": "service.name:=\"goldmane-otel-adapter\" | unpack_json | stats by (flow.Key.destNamespace) count() | keep flow.Key.destNamespace", "includeAll": true, "label": "Destination Namespace", - "multi": true, "name": "DstNamespace", 
"options": [], "query": { - "qryType": 1, - "query": "label_values(netobserv_workload_ingress_bytes_total,DstK8S_Namespace)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" + "field": "flow.Key.destNamespace", + "limit": 25, + "query": "service.name:=\"goldmane-otel-adapter\" | unpack_json | stats by (flow.Key.destNamespace) count() | keep flow.Key.destNamespace", + "refId": "VictoriaLogsVariableQueryEditor-VariableQuery", + "type": "fieldValue" }, "refresh": 1, "regex": "", @@ -1061,16 +603,22 @@ "postgresql-cluster" ] }, - "definition": "label_values(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=\"$SrcNamespace\"},SrcK8S_OwnerName)", + "datasource": { + "type": "victoriametrics-logs-datasource", + "uid": "PD775F2863313E6C7" + }, + "definition": "service.name:=\"goldmane-otel-adapter\" | unpack_json | filter flow.Key.sourceNamespace:=\"$SrcNamespace\" | stats by (flow.Key.sourceName) count() | keep flow.Key.sourceName", "includeAll": true, "label": "Pod Name", "multi": true, "name": "Pod", "options": [], "query": { - "qryType": 1, - "query": "label_values(netobserv_workload_egress_bytes_total{SrcK8S_Namespace=\"$SrcNamespace\"},SrcK8S_OwnerName)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" + "field": "flow.Key.sourceName", + "limit": 1000, + "query": "service.name:=\"goldmane-otel-adapter\" | unpack_json | filter flow.Key.sourceNamespace:=\"$SrcNamespace\" | stats by (flow.Key.sourceName) count() | keep flow.Key.sourceName", + "refId": "VictoriaLogsVariableQueryEditor-VariableQuery", + "type": "fieldValue" }, "refresh": 1, "regex": "", From 2f7b3f1a3ea731b9f612e4e4ae2af6fc60f31ba7 Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 14:43:52 +0530 Subject: [PATCH 4/7] feat(keycloak): resource linkages --- modules/keycloak/deployment.tf | 4 ++-- modules/keycloak/ingress.tf | 2 +- modules/keycloak/networkpolicy.tf | 2 +- modules/keycloak/service.tf | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/modules/keycloak/deployment.tf b/modules/keycloak/deployment.tf index 651d622..5323a6c 100644 --- a/modules/keycloak/deployment.tf +++ b/modules/keycloak/deployment.tf @@ -2,7 +2,7 @@ resource "kubernetes_stateful_set" "keycloak_cluster" { metadata { name = "keycloak-cluster" - namespace = var.namespace + namespace = kubernetes_namespace.namespace.metadata[0].name labels = { app = "keycloak" component = "statefulset" @@ -238,7 +238,7 @@ resource "kubernetes_stateful_set" "keycloak_cluster" { limits = { "cpu" = "500m" - "memory" = "1Gi" + "memory" = "2Gi" } } diff --git a/modules/keycloak/ingress.tf b/modules/keycloak/ingress.tf index a817d27..2db278e 100644 --- a/modules/keycloak/ingress.tf +++ b/modules/keycloak/ingress.tf @@ -2,7 +2,7 @@ resource "kubernetes_ingress_v1" "ingress" { metadata { name = "ingress" - namespace = var.namespace + namespace = kubernetes_namespace.namespace.metadata[0].name labels = { app = var.app_name component = "ingress" diff --git a/modules/keycloak/networkpolicy.tf b/modules/keycloak/networkpolicy.tf index 35e4fd1..c55fb83 100644 --- a/modules/keycloak/networkpolicy.tf +++ b/modules/keycloak/networkpolicy.tf @@ -2,7 +2,7 @@ resource "kubernetes_network_policy" "keycloak_network_access_policy" { metadata { name = "keycloak-network-access-policy" - namespace = var.namespace + namespace = kubernetes_namespace.namespace.metadata[0].name } spec { policy_types = ["Ingress", "Egress"] diff --git a/modules/keycloak/service.tf b/modules/keycloak/service.tf index 381be97..e39a133 100644 --- a/modules/keycloak/service.tf +++ b/modules/keycloak/service.tf @@ -2,7 +2,7 @@ resource "kubernetes_service" "keycloak_discovery" { metadata { name = "keycloak-discovery" - namespace = var.namespace + namespace = kubernetes_namespace.namespace.metadata[0].name } spec { @@ -25,7 +25,7 @@ resource "kubernetes_service" "keycloak_discovery" { resource "kubernetes_service" "keycloak_service" { metadata { name = "keycloak-cluster-service" - namespace = 
var.namespace + namespace = kubernetes_namespace.namespace.metadata[0].name } spec { From 10718503c2bdd69ff5e273bc0672b1ff29204491 Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 14:44:05 +0530 Subject: [PATCH 5/7] docs(helm): README update --- modules/helm/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/helm/README.md b/modules/helm/README.md index 009358a..470ae2b 100644 --- a/modules/helm/README.md +++ b/modules/helm/README.md @@ -5,7 +5,7 @@ OpenTofu Module to deploy the following required helm charts: 2. [Cloudnative PG (including Barman Plugin)](https://cloudnative-pg.io/) 3. [Traefik](https://traefik.io/) 4. [Calico CNI](https://www.tigera.io/project-calico/) -5. [NetObserv](https://github.com/netobserv) +5. [External Secrets](https://external-secrets.io) ## Providers @@ -22,7 +22,6 @@ OpenTofu Module to deploy the following required helm charts: | [helm_release.cnpg](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [helm_release.cnpg_barman_plugin](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [helm_release.external_secrets](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | -| [helm_release.netobserv](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [helm_release.traefik](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | ## Inputs @@ -34,7 +33,6 @@ OpenTofu Module to deploy the following required helm charts: | [cnpg\_barman\_configuration](#input\_cnpg\_barman\_configuration) | Dictionary filled with Cloud Native PG Barman Configuration Details | `map(string)` |
{
"chart": "plugin-barman-cloud",
"name": "cnpg-barman",
"namespace": "cnpg-system",
"repository": "https://cloudnative-pg.github.io/charts",
"version": "v0.2.0"
}
| no | | [cnpg\_configuration](#input\_cnpg\_configuration) | Dictionary filled with Cloud Native PG Operator Configuration Details | `map(string)` |
{
"chart": "cloudnative-pg",
"create_namespace": true,
"name": "cnpg",
"namespace": "cnpg-system",
"repository": "https://cloudnative-pg.github.io/charts",
"version": "v0.26.0"
}
| no | | [external\_secrets\_configuration](#input\_external\_secrets\_configuration) | Dictionary filled with External Secrets Operator Configuration Details | `map(string)` |
{
"chart": "external-secrets",
"create_namespace": true,
"name": "external-secrets",
"namespace": "external-secrets",
"repository": "https://charts.external-secrets.io",
"version": "2.1.0"
}
| no | -| [netobserv\_configuration](#input\_netobserv\_configuration) | Dictionary filled with Netobserv Operator Configuration Details | `map(string)` |
{
"chart": "netobserv-operator",
"create_namespace": true,
"name": "netobserv",
"namespace": "netobserv",
"repository": "https://netobserv.io/static/helm",
"version": "1.11.0"
}
| no | | [server\_node\_selector](#input\_server\_node\_selector) | Node Selector Label Value to be used for deploying required foundation components | `string` | n/a | yes | | [traefik\_configuration](#input\_traefik\_configuration) | Dictionary filled with Traefik Controller Configuration Details | `map(string)` |
{
"chart": "traefik",
"create_namespace": "true",
"name": "traefik",
"namespace": "traefik",
"repository": "https://traefik.github.io/charts",
"version": "v39.0.7"
}
| no | From 2be933fe15ae3fe0f475e6c4e30bce7e9b6d12ef Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 14:44:14 +0530 Subject: [PATCH 6/7] docs(observability): README update --- modules/observability/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/observability/README.md b/modules/observability/README.md index 9914221..879a901 100644 --- a/modules/observability/README.md +++ b/modules/observability/README.md @@ -7,8 +7,7 @@ These components are being deployed as part of the Observability Module: 2. [VictoriaLogs](https://victoriametrics.com/products/victorialogs/) for logs storage generated from the cluster. 3. [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) for receiving and processing and export telemetry data to the storage databases. 4. [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) to generate and expose cluster-level metrics. -5. [NetObserv](https://github.com/netobserv) for components related to Network Observability in Kubernetes based on eBPF. -6. [Grafana](https://grafana.com/oss/grafana/?plcmt=oss-nav) for the visual layer for observability. +5. [Grafana](https://grafana.com/oss/grafana/?plcmt=oss-nav) for the visual layer for observability. 
## Providers @@ -26,6 +25,8 @@ These components are being deployed as part of the Observability Module: | [helm_release.logs](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [helm_release.metrics](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [helm_release.otel_collector](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | +| [kubernetes_config_map.goldmane_api_proto](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource | +| [kubernetes_deployment.goldmane_otel_adapter](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/deployment) | resource | | [kubernetes_ingress_v1.ingress](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/ingress_v1) | resource | | [kubernetes_manifest.certificate_authority](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | | [kubernetes_manifest.grafana_credentials_sync](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | @@ -34,7 +35,6 @@ These components are being deployed as part of the Observability Module: | [kubernetes_manifest.issuer](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | | [kubernetes_manifest.middleware_buffering](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | | [kubernetes_manifest.middleware_rewrite](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | -| [kubernetes_manifest.network_observability](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | | 
[kubernetes_manifest.password_generator](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | | [kubernetes_manifest.public_issuer](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | | [kubernetes_manifest.push_grafana_credentials](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/manifest) | resource | From cb9bc3a0eccce6f22439934d9ba1b36a5f34e0b8 Mon Sep 17 00:00:00 2001 From: khatrivarun Date: Fri, 1 May 2026 15:00:00 +0530 Subject: [PATCH 7/7] ci(releases): few fixes for releases --- .github/workflows/releases.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 75fff78..b6e634d 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -29,7 +29,7 @@ jobs: MODULE: ${{ inputs.module_name }} RELEASE_TYPE: ${{ inputs.release_type }} IS_DEV_RELEASE: ${{ inputs.is_dev_release }} - CURRENT_FULL_VERSION: ${{ vars.FULL_VERSION || '0.0.0' }} + CURRENT_FULL_VERSION: ${{ vars.MODULE_VERSION || '0.0.0' }} outputs: version: ${{ steps.version.outputs.NEXT_RELEASE_VERSION }} changelog: ${{ steps.changelog.outputs.NOTES }} @@ -166,15 +166,17 @@ jobs: echo "EOF" >> $GITHUB_OUTPUT - name: Printing out the release information as a step summary for validation + env: + CHANGELOG_NOTES: ${{ steps.changelog.outputs.NOTES }} run: | echo "### Release Preview: ${{ inputs.module_name }}" >> $GITHUB_STEP_SUMMARY echo "**New Version:** \`${{ steps.version.outputs.NEXT_RELEASE_VERSION }}\`" >> $GITHUB_STEP_SUMMARY echo "**Release Type:** ${{ inputs.is_dev_release && 'In Development (Dev)' || 'Stable' }}" >> $GITHUB_STEP_SUMMARY echo "#### Proposed Changelogs:" >> $GITHUB_STEP_SUMMARY - if [ -z "${{ steps.changelog.outputs.NOTES }}" ]; then + if [ -z "$CHANGELOG_NOTES" ]; then echo "*No descriptive changes found (commits may lack the 
'Implemented Changes' section).*" >> $GITHUB_STEP_SUMMARY else - echo "${{ steps.changelog.outputs.NOTES }}" >> $GITHUB_STEP_SUMMARY + echo "$CHANGELOG_NOTES" >> $GITHUB_STEP_SUMMARY fi echo "" >> $GITHUB_STEP_SUMMARY echo "> **Note:** Please review the details above. If correct, approve the next job to finalize the release." >> $GITHUB_STEP_SUMMARY