From 696cbb7e1b6b4f24803b7a3e61ded3415a1ee5e3 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sun, 8 Feb 2026 21:10:02 +0100 Subject: [PATCH 01/17] [#158]: moved map manager from identity to common crate to reuse the function in metrics service --- core/common/Cargo.toml | 1 + core/common/src/map_handlers.rs | 54 ++++++++++++++++++++ core/src/components/identity/src/main.rs | 57 ++++++---------------- core/src/components/metrics/src/helpers.rs | 54 +++++++++----------- 4 files changed, 91 insertions(+), 75 deletions(-) diff --git a/core/common/Cargo.toml b/core/common/Cargo.toml index b8e840d..f604b65 100644 --- a/core/common/Cargo.toml +++ b/core/common/Cargo.toml @@ -29,5 +29,6 @@ bytemuck_derive = "1.10.2" map-handlers = [] program-handlers = [] network-structs = [] +monitoring-structs = [] buffer-reader = [] experimental = [] diff --git a/core/common/src/map_handlers.rs b/core/common/src/map_handlers.rs index 19d4e20..313f73e 100644 --- a/core/common/src/map_handlers.rs +++ b/core/common/src/map_handlers.rs @@ -154,3 +154,57 @@ pub fn load_perf_event_array_from_mapdata( })?; Ok(perf_event_array) } + +#[cfg(feature = "map-handlers")] +pub fn map_manager( + maps: BpfMapsData, +) -> Result< + std::collections::HashMap< + String, + ( + aya::maps::PerfEventArray, + Vec>, + ), + >, + Error, +> { + use aya::maps::PerfEventArray; + use aya::maps::{MapData, perf::PerfEventArrayBuffer}; + use tracing::debug; + + let mut map_manager = std::collections::HashMap::< + String, // this will store the bpf map name + (PerfEventArray, Vec>), // this will manage the BPF_MAP_TYPE_PERF_EVENT_ARRAY and its buffer + >::new(); + + // map_manager creates an hashmap that contains: + // MAP NAME as String (KEY) + // + // VALUES (tuple) + // a PERF_EVENT_ARRAY + // a vector of PERF_EVENT_ARRAY_BUFFER + // + // the map manager helps the event listener to specifically call a map by its pinned name + // e.g. 
veth_identity_map and returns the associated PERF_EVENT_ARRAY and PERF_EVENT_ARRAY_BUFFERS (1 per CPU) + // also the map manager helps to write a more complete debug context by linking map names with arrays and buffers. + // actually i cannot return the extact information using only the Aya library + + // create the PerfEventArrays and the buffers from the BpfMapsData Objects + for (map, name) in maps + .bpf_obj_map + .into_iter() + .zip(maps.bpf_obj_names.into_iter()) + // zip two iterators at the same time for map object and map names + { + debug!("Debugging map type:{:?} for map name {:?}", map, &name); + info!("Creating PerfEventArray for map name {:?}", &name); + + // save the map in a registry if is a PerfEventArray to access them by name + if let std::result::Result::Ok(perf_event_array) = PerfEventArray::try_from(map) { + map_manager.insert(name.clone(), (perf_event_array, Vec::new())); + } else { + warn!("Map {:?} is not a PerfEventArray, skipping load", &name); + } + } + Ok(map_manager) +} diff --git a/core/src/components/identity/src/main.rs b/core/src/components/identity/src/main.rs index 598b964..c70011e 100644 --- a/core/src/components/identity/src/main.rs +++ b/core/src/components/identity/src/main.rs @@ -14,10 +14,6 @@ mod service_discovery; use crate::helpers::{get_veth_channels, read_perf_buffer}; use aya::{ Ebpf, - maps::{ - MapData, - perf::{PerfEventArray, PerfEventArrayBuffer}, - }, programs::{SchedClassifier, TcAttachType, tc::SchedClassifierLinkId}, util::online_cpus, }; @@ -25,8 +21,9 @@ use aya::{ #[cfg(feature = "experimental")] use crate::helpers::scan_cgroup_cronjob; -use bytes::BytesMut; -use cortexbrain_common::map_handlers::{init_bpf_maps, map_pinner, populate_blocklist}; +use cortexbrain_common::map_handlers::{ + init_bpf_maps, map_manager, map_pinner, populate_blocklist, +}; use cortexbrain_common::program_handlers::load_program; use cortexbrain_common::{buffer_type::BufferType, map_handlers::BpfMapsData}; use std::{ @@ -36,11 
+33,11 @@ use std::{ }; use anyhow::{Context, Ok}; +use cortexbrain_common::buffer_type::BufferSize; use cortexbrain_common::{constants, logger}; -use tokio::{fs, signal}; -use tracing::{debug, error, info, warn}; - use std::collections::HashMap; +use tokio::{fs, signal}; +use tracing::{error, info}; #[tokio::main] async fn main() -> Result<(), anyhow::Error> { @@ -203,34 +200,11 @@ async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> { //TODO: try to change from PerfEventArray to a RingBuffer data structure - let mut map_manager = - HashMap::, Vec>)>::new(); - - // create the PerfEventArrays and the buffers from the BpfMapsData Objects - for (map, name) in bpf_maps - .bpf_obj_map - .into_iter() - .zip(bpf_maps.bpf_obj_names.into_iter()) - // zip two iterators at the same time for map and mapnames - { - debug!("Debugging map type:{:?} for map name {:?}", map, &name); - info!("Creating PerfEventArray for map name {:?}", &name); - - // save the map in a registry if is a PerfEventArray to access them by name - if let std::result::Result::Ok(perf_event_array) = PerfEventArray::try_from(map) { - map_manager.insert(name.clone(), (perf_event_array, Vec::new())); - - // perf_event_arrays.push(perf_event_array); // this is step 1 - // let perf_event_array_buffer = Vec::new(); - // event_buffers.push(perf_event_array_buffer); //this is step 2 - } else { - warn!("Map {:?} is not a PerfEventArray, skipping load", &name); - } - } + let mut maps = map_manager(bpf_maps)?; // fill the input buffers with data from the PerfEventArrays for cpu_id in online_cpus().map_err(|e| anyhow::anyhow!("Error {:?}", e))? { - for (name, (perf_evt_array, perf_evt_array_buffer)) in map_manager.iter_mut() { + for (name, (perf_evt_array, perf_evt_array_buffer)) in maps.iter_mut() { let buf = perf_evt_array.open(cpu_id, None)?; info!( "Buffer created for map {:?} on cpu_id {:?}. 
Buffer size: {}", @@ -245,23 +219,20 @@ async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> { info!("Listening for events..."); // i need to use remove to move the values from the Map Manager to the the async tasks - let (perf_veth_array, perf_veth_buffers) = map_manager + let (perf_veth_array, perf_veth_buffers) = maps .remove("veth_identity_map") .expect("Cannot create perf_veth buffer"); - let (perf_net_events_array, perf_net_events_buffers) = map_manager + let (perf_net_events_array, perf_net_events_buffers) = maps .remove("events_map") .expect("Cannot create perf_net_events buffer"); - let (tcp_registry_array, tcp_registry_buffers) = map_manager + let (tcp_registry_array, tcp_registry_buffers) = maps .remove("TcpPacketRegistry") .expect("Cannot create tcp_registry buffer"); // init output buffers - let veth_buffers = vec![BytesMut::with_capacity(10 * 1024); online_cpus().iter().len()]; - let events_buffers = vec![BytesMut::with_capacity(1024); online_cpus().iter().len()]; - let tcp_buffers = vec![BytesMut::with_capacity(1024); online_cpus().iter().len()]; - - // init veth link ids - //let veth_link_ids = link_ids; + let veth_buffers = BufferSize::VethEvents.set_buffer(); + let events_buffers = BufferSize::ClassifierNetEvents.set_buffer(); + let tcp_buffers = BufferSize::TcpEvents.set_buffer(); // spawn async tasks let veth_events_displayer = tokio::spawn(async move { diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index a67b607..e0ab006 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -1,5 +1,5 @@ use aya::{ - maps::{Map, MapData, PerfEventArray, perf::PerfEventArrayBuffer}, + maps::{MapData, perf::PerfEventArrayBuffer}, util::online_cpus, }; @@ -10,10 +10,14 @@ use std::sync::{ }; use tokio::signal; -use tracing::{debug, error, info, warn}; +use tracing::{error, info}; -use crate::structs::NetworkMetrics; -use 
crate::structs::TimeStampMetrics; +use cortexbrain_common::map_handlers::map_manager; +use cortexbrain_common::{ + buffer_type::{BufferSize, BufferType}, + buffer_type::{NetworkMetrics, TimeStampMetrics}, + map_handlers::BpfMapsData, +}; pub async fn display_metrics_map( mut perf_buffers: Vec>, @@ -119,50 +123,36 @@ pub async fn display_time_stamp_events_map( info!("Timestamp event listener stopped"); } -pub async fn event_listener(bpf_maps: Vec) -> Result<(), anyhow::Error> { +pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> { info!("Getting CPU count..."); - let mut perf_event_arrays = Vec::new(); // contains a vector of PerfEventArrays - let mut event_buffers = Vec::new(); // contains a vector of buffers - - info!("Creating perf buffers..."); - for map in bpf_maps { - debug!("Debugging map type:{:?}", map); - if let std::result::Result::Ok(perf_event_array) = PerfEventArray::try_from(map) { - perf_event_arrays.push(perf_event_array); // this is step 1 - let perf_event_array_buffer = Vec::new(); - event_buffers.push(perf_event_array_buffer); //this is step 2 - } else { - warn!("Map is not a PerfEventArray, skipping load"); - } - } + let mut maps = map_manager(bpf_maps)?; let cpu_count = online_cpus().map_err(|e| anyhow::anyhow!("Error {:?}", e))?; - //info!("CPU count: {}", cpu_count); - for (perf_evt_array, perf_evt_array_buffer) in - perf_event_arrays.iter_mut().zip(event_buffers.iter_mut()) - { - for cpu_id in &cpu_count { - let single_buffer = perf_evt_array.open(*cpu_id, None)?; - perf_evt_array_buffer.push(single_buffer); + for cpu_id in cpu_count { + for (name, (perf_event_array, perf_event_buffer)) in maps.iter_mut() { + let buf = perf_event_array.open(cpu_id, None)?; + perf_event_buffer.push(buf); } } - //info!("Opening perf buffers for {} CPUs...", cpu_count); info!("Perf buffers created successfully"); - let mut event_buffers = event_buffers.into_iter(); - let time_stamp_events_perf_buffer = 
event_buffers.next().expect(""); - let net_perf_buffer = event_buffers.next().expect(""); + let (time_stamp_events_array, time_stamp_events_perf_buffer) = maps + .remove("time_stamp_events") + .expect("Cannot create time_stamp_events_buffer"); + let (net_perf_array, net_perf_buffer) = maps + .remove("net_metrics") + .expect("Cannot create net_perf_buffer"); // Create shared running flags let net_metrics_running = Arc::new(AtomicBool::new(true)); let time_stamp_events_running = Arc::new(AtomicBool::new(true)); // Create proper sized buffers - let net_metrics_buffers = vec![BytesMut::with_capacity(1024); cpu_count.len()]; - let time_stamp_events_buffers = vec![BytesMut::with_capacity(1024); cpu_count.len()]; + let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); + let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); // Clone for the signal handler let net_metrics_running_signal = net_metrics_running.clone(); From 9575e8957e54e25208f696da6a964f833b1974f4 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Wed, 11 Feb 2026 21:39:29 +0100 Subject: [PATCH 02/17] [#158]: moved Monitoring structures to shared library --- core/common/src/buffer_type.rs | 286 +++++++++++++++++++- core/common/src/lib.rs | 7 +- core/src/components/identity/src/helpers.rs | 58 +--- core/src/components/metrics/src/structs.rs | 33 --- 4 files changed, 286 insertions(+), 98 deletions(-) delete mode 100644 core/src/components/metrics/src/structs.rs diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index 9fc7828..ad906ce 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -1,3 +1,4 @@ +use aya::{maps::perf::PerfEventArrayBuffer, util::online_cpus}; use bytemuck_derive::Zeroable; use bytes::BytesMut; use std::net::Ipv4Addr; @@ -54,19 +55,21 @@ unsafe impl aya::Pod for PacketLog {} #[cfg(feature = "network-structs")] #[repr(C, packed)] -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Zeroable)] pub 
struct VethLog { pub name: [u8; 16], // 16 bytes: veth interface name pub state: u64, // 8 bytes: state variable (unsigned long in kernel) - pub dev_addr: [u8; 6], // 32 bytes: device address + pub dev_addr: [u8; 6], // 6 bytes: device address pub event_type: u8, // 1 byte: 1 for veth creation, 2 for veth destruction pub netns: u32, // 4 bytes: network namespace inode number pub pid: u32, // 4 bytes: PID that triggered the event } +#[cfg(feature = "network-structs")] +unsafe impl aya::Pod for VethLog {} #[cfg(feature = "network-structs")] #[repr(C)] -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Zeroable)] pub struct TcpPacketRegistry { pub proto: u8, pub src_ip: u32, @@ -77,6 +80,47 @@ pub struct TcpPacketRegistry { pub command: [u8; 16], pub cgroup_id: u64, } +#[cfg(feature = "network-structs")] +unsafe impl aya::Pod for TcpPacketRegistry {} + +#[cfg(feature = "monitoring-structs")] +pub const TASK_COMM_LEN: usize = 16; // linux/sched.h +#[cfg(feature = "monitoring-structs")] +#[repr(C)] +#[derive(Clone, Copy, Zeroable)] +pub struct NetworkMetrics { + pub tgid: u32, + pub comm: [u8; TASK_COMM_LEN], + pub ts_us: u64, + pub sk_err: i32, // Offset 284 + pub sk_err_soft: i32, // Offset 600 + pub sk_backlog_len: i32, // Offset 196 + pub sk_write_memory_queued: i32, // Offset 376 + pub sk_receive_buffer_size: i32, // Offset 244 + pub sk_ack_backlog: u32, // Offset 604 + pub sk_drops: i32, // Offset 136 +} +#[cfg(feature = "monitoring-structs")] +unsafe impl aya::Pod for NetworkMetrics {} + +#[cfg(feature = "monitoring-structs")] +#[repr(C)] +#[derive(Clone, Copy, Zeroable)] +pub struct TimeStampMetrics { + pub delta_us: u64, + pub ts_us: u64, + pub tgid: u32, + pub comm: [u8; TASK_COMM_LEN], + pub lport: u16, + pub dport_be: u16, + pub af: u16, + pub saddr_v4: u32, + pub daddr_v4: u32, + pub saddr_v6: [u32; 4], + pub daddr_v6: [u32; 4], +} +#[cfg(feature = "monitoring-structs")] +unsafe impl aya::Pod for TimeStampMetrics {} // docs: // This function perform a byte swap 
from little-endian to big-endian @@ -95,15 +139,23 @@ pub fn reverse_be_addr(addr: u32) -> Ipv4Addr { // enum BuffersType #[cfg(feature = "buffer-reader")] pub enum BufferType { + #[cfg(feature = "network-structs")] PacketLog, + #[cfg(feature = "network-structs")] TcpPacketRegistry, + #[cfg(feature = "network-structs")] VethLog, + #[cfg(feature = "monitoring-structs")] + NetworkMetrics, + #[cfg(feature = "monitoring-structs")] + TimeStampMetrics, } // IDEA: this is an experimental implementation to centralize buffer reading logic // TODO: add variant for cortexflow API exporter #[cfg(feature = "buffer-reader")] impl BufferType { + #[cfg(feature = "network-structs")] pub async fn read_packet_log(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { for i in offset..tot_events { let vec_bytes = &buffers[i as usize]; @@ -147,6 +199,7 @@ impl BufferType { } } } + #[cfg(feature = "network-structs")] pub async fn read_tcp_registry_log(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { for i in offset..tot_events { let vec_bytes = &buffers[i as usize]; @@ -204,11 +257,8 @@ impl BufferType { } } } - pub async fn read_and_handle_veth_log( - buffers: &mut [BytesMut], - tot_events: i32, - offset: i32, - ) { + #[cfg(feature = "network-structs")] + pub async fn read_and_handle_veth_log(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { for i in offset..tot_events { let vec_bytes = &buffers[i as usize]; if vec_bytes.len() < std::mem::size_of::() { @@ -289,4 +339,224 @@ impl BufferType { } } } + #[cfg(feature = "monitoring-structs")] + pub async fn read_network_metrics(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted Network Metrics data. Raw data: {}. 
Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let net_metrics: NetworkMetrics = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + let tgid = net_metrics.tgid; + let comm = String::from_utf8_lossy(&net_metrics.comm); + let ts_us = net_metrics.ts_us; + let sk_drop_count = net_metrics.sk_drops; + let sk_err = net_metrics.sk_err; + let sk_err_soft = net_metrics.sk_err_soft; + let sk_backlog_len = net_metrics.sk_backlog_len; + let sk_write_memory_queued = net_metrics.sk_write_memory_queued; + let sk_ack_backlog = net_metrics.sk_ack_backlog; + let sk_receive_buffer_size = net_metrics.sk_receive_buffer_size; + + info!( + "tgid: {}, comm: {}, ts_us: {}, sk_drops: {}, sk_err: {}, sk_err_soft: {}, sk_backlog_len: {}, sk_write_memory_queued: {}, sk_ack_backlog: {}, sk_receive_buffer_size: {}", + tgid, + comm, + ts_us, + sk_drop_count, + sk_err, + sk_err_soft, + sk_backlog_len, + sk_write_memory_queued, + sk_ack_backlog, + sk_receive_buffer_size + ); + } + } + } + #[cfg(feature = "monitoring-structs")] + pub async fn read_timestamp_metrics(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted Network Metrics data. Raw data: {}. 
Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let time_stamp_event: TimeStampMetrics = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + let delta_us = time_stamp_event.delta_us; + let ts_us = time_stamp_event.ts_us; + let tgid = time_stamp_event.tgid; + let comm = String::from_utf8_lossy(&time_stamp_event.comm); + let lport = time_stamp_event.lport; + let dport_be = time_stamp_event.dport_be; + let af = time_stamp_event.af; + info!( + "TimeStampEvent - delta_us: {}, ts_us: {}, tgid: {}, comm: {}, lport: {}, dport_be: {}, af: {}", + delta_us, ts_us, tgid, comm, lport, dport_be, af + ); + } + } + } +} + +// docs: read buffer function: +// template function that take a mut perf_event_array_buffer of type T and a mutable buffer of Vec +#[cfg(feature = "buffer-reader")] +pub async fn read_perf_buffer>( + mut array_buffers: Vec>, + mut buffers: Vec, + buffer_type: BufferType, +) { + // loop over the buffers + loop { + for buf in array_buffers.iter_mut() { + match buf.read_events(&mut buffers) { + Ok(events) => { + // triggered if some events are lost + if events.lost > 0 { + tracing::debug!("Lost events: {} ", events.lost); + } + // triggered if some events are readed + if events.read > 0 { + tracing::debug!("Readed events: {}", events.read); + let offset = 0; + let tot_events = events.read as i32; + + //read the events in the buffer + match buffer_type { + #[cfg(feature = "network-structs")] + BufferType::PacketLog => { + BufferType::read_packet_log(&mut buffers, tot_events, offset).await + } + #[cfg(feature = "network-structs")] + BufferType::TcpPacketRegistry => { + BufferType::read_tcp_registry_log(&mut buffers, tot_events, offset) + .await + } + #[cfg(feature = "network-structs")] + BufferType::VethLog => { + BufferType::read_and_handle_veth_log( + &mut 
buffers, + tot_events, + offset, + ) + .await + } + #[cfg(feature = "monitoring-structs")] + BufferType::NetworkMetrics => { + BufferType::read_network_metrics(&mut buffers, tot_events, offset) + .await + } + #[cfg(feature = "monitoring-structs")] + BufferType::TimeStampMetrics => { + BufferType::read_timestamp_metrics(&mut buffers, tot_events, offset) + .await + } + } + } + } + Err(e) => { + error!("Cannot read events from buffer. Reason: {} ", e); + } + } + } + tokio::time::sleep(std::time::Duration::from_millis(100)).await; // small sleep + } +} + +#[cfg(feature = "buffer-reader")] +pub enum BufferSize { + #[cfg(feature = "network-structs")] + ClassifierNetEvents, + #[cfg(feature = "network-structs")] + VethEvents, + #[cfg(feature = "network-structs")] + TcpEvents, + #[cfg(feature = "monitoring-structs")] + NetworkMetricsEvents, + #[cfg(feature = "monitoring-structs")] + TimeMetricsEvents, +} +#[cfg(feature = "buffer-reader")] +impl BufferSize { + pub fn get_size(&self) -> usize { + match self { + #[cfg(feature = "network-structs")] + BufferSize::ClassifierNetEvents => std::mem::size_of::(), + #[cfg(feature = "network-structs")] + BufferSize::VethEvents => std::mem::size_of::(), + #[cfg(feature = "network-structs")] + BufferSize::TcpEvents => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::NetworkMetricsEvents => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::TimeMetricsEvents => std::mem::size_of::(), + } + } + pub fn set_buffer(&self) -> Vec { + // iter returns and iterator of cpu ids, + // we need only the total number of cpus to set the buffer size so we use .len() to get + // the count of total cpus and then we allocate a buffer for each cpu with a capacity + // based on the structure size * a factor to have a bigger buffer to avoid overflows and lost events + + // Old buffers where 1024 bytes long. Now we set different buffer size based on + // the frequence of the events. 
+ // ClassifierNetEvents are triggered by the TC classifier program, events has high frequency + // VethEvents are triggered by the creation and deletion of veth interfaces, events has small frequency compared to classifier events + // TcpEvents are triggered by TCP events and connections. Events has similar frequency to ClassifierNetEvents. + + let tot_cpu = online_cpus().iter().len(); // total number of cpus + + // TODO: finish to do all the calculations for the buffer sizes + match self { + #[cfg(feature = "network-structs")] + BufferSize::ClassifierNetEvents => { + let capacity = self.get_size() * 200; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } + #[cfg(feature = "network-structs")] + BufferSize::VethEvents => { + let capacity = self.get_size() * 100; // Allocates 4Kb of memory for the buffers + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } + #[cfg(feature = "network-structs")] + BufferSize::TcpEvents => { + let capacity = self.get_size() * 200; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } + #[cfg(feature = "monitoring-structs")] + BufferSize::NetworkMetricsEvents => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } + #[cfg(feature = "monitoring-structs")] + BufferSize::TimeMetricsEvents => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } + } + } } diff --git a/core/common/src/lib.rs b/core/common/src/lib.rs index d88c1db..d7e48b0 100644 --- a/core/common/src/lib.rs +++ b/core/common/src/lib.rs @@ -1,5 +1,8 @@ -#[cfg(feature = "buffer-reader")] -#[cfg(feature = "network-structs")] +#[cfg(any( + feature = "buffer-reader", + feature = "network-structs", + feature = "monitoring-structs" +))] pub mod buffer_type; pub mod constants; pub mod formatters; diff --git a/core/src/components/identity/src/helpers.rs b/core/src/components/identity/src/helpers.rs index bd76a29..50414bf 100644 --- 
a/core/src/components/identity/src/helpers.rs +++ b/core/src/components/identity/src/helpers.rs @@ -1,14 +1,13 @@ -use aya::maps::perf::PerfEventArrayBuffer; -use cortexbrain_common::buffer_type::BufferType; use nix::net::if_::if_nameindex; use std::result::Result::Ok; -use tracing::{error, info}; +use tracing::info; // docs: // This function checks if the given interface name is in the list of ignored interfaces // Takes a interface name (iface) as &str and returns true if the interface should be ignored // Typically we want to ignore eth0,docker0,tunl0,lo interfaces because they are not relevant for the internal monitoring // +#[inline(always)] pub fn ignore_iface(iface: &str) -> bool { let ignored_interfaces = ["eth0", "docker0", "tunl0", "lo"]; ignored_interfaces.contains(&iface) @@ -18,6 +17,7 @@ pub fn ignore_iface(iface: &str) -> bool { // This function retrieves the list of veth interfaces on the system, filtering out ignored interfaces with // the ignore_iface function. // +#[inline(always)] pub fn get_veth_channels() -> Vec { //filter interfaces and save the output in the let mut interfaces: Vec = Vec::new(); @@ -36,58 +36,6 @@ pub fn get_veth_channels() -> Vec { interfaces } -// docs: read buffer function: -// template function that take a mut perf_event_array_buffer of type T and a mutable buffer of Vec - -pub async fn read_perf_buffer>( - mut array_buffers: Vec>, - mut buffers: Vec, - buffer_type: BufferType, -) { - // loop over the buffers - loop { - for buf in array_buffers.iter_mut() { - match buf.read_events(&mut buffers) { - Ok(events) => { - // triggered if some events are lost - if events.lost > 0 { - tracing::debug!("Lost events: {} ", events.lost); - } - // triggered if some events are readed - if events.read > 0 { - tracing::debug!("Readed events: {}", events.read); - let offset = 0; - let tot_events = events.read as i32; - - //read the events in the buffer - match buffer_type { - BufferType::PacketLog => { - BufferType::read_packet_log(&mut 
buffers, tot_events, offset).await - } - BufferType::TcpPacketRegistry => { - BufferType::read_tcp_registry_log(&mut buffers, tot_events, offset) - .await - } - BufferType::VethLog => { - BufferType::read_and_handle_veth_log( - &mut buffers, - tot_events, - offset, - ) - .await - } - } - } - } - Err(e) => { - error!("Cannot read events from buffer. Reason: {} ", e); - } - } - } - tokio::time::sleep(std::time::Duration::from_millis(100)).await; // small sleep - } -} - #[cfg(test)] mod tests { use cortexbrain_common::buffer_type::VethLog; diff --git a/core/src/components/metrics/src/structs.rs b/core/src/components/metrics/src/structs.rs deleted file mode 100644 index dc63ace..0000000 --- a/core/src/components/metrics/src/structs.rs +++ /dev/null @@ -1,33 +0,0 @@ - -pub const TASK_COMM_LEN: usize = 16; // linux/sched.h - -#[repr(C, packed)] -#[derive(Clone, Copy)] -pub struct NetworkMetrics { - pub tgid: u32, - pub comm: [u8; TASK_COMM_LEN], - pub ts_us: u64, - pub sk_err: i32, // Offset 284 - pub sk_err_soft: i32, // Offset 600 - pub sk_backlog_len: i32, // Offset 196 - pub sk_write_memory_queued: i32, // Offset 376 - pub sk_receive_buffer_size: i32, // Offset 244 - pub sk_ack_backlog: u32, // Offset 604 - pub sk_drops: i32, // Offset 136 -} - -#[repr(C)] -#[derive(Clone, Copy)] -pub struct TimeStampMetrics { - pub delta_us: u64, - pub ts_us: u64, - pub tgid: u32, - pub comm: [u8; TASK_COMM_LEN], - pub lport: u16, - pub dport_be: u16, - pub af: u16, - pub saddr_v4: u32, - pub daddr_v4: u32, - pub saddr_v6: [u32; 4], - pub daddr_v6: [u32; 4], -} \ No newline at end of file From 9881df307f0d44b3d1c98e2005ac66b8b0691508 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Wed, 11 Feb 2026 21:42:06 +0100 Subject: [PATCH 03/17] [#175]: added otlp provider to metrics service. 
Simplified map handling and added read_perf_buffer function --- core/common/Cargo.toml | 1 + core/src/components/metrics/Cargo.toml | 5 +- core/src/components/metrics/src/helpers.rs | 148 ++------------------- core/src/components/metrics/src/main.rs | 8 +- 4 files changed, 22 insertions(+), 140 deletions(-) diff --git a/core/common/Cargo.toml b/core/common/Cargo.toml index f604b65..ee50e2b 100644 --- a/core/common/Cargo.toml +++ b/core/common/Cargo.toml @@ -24,6 +24,7 @@ opentelemetry-otlp = { version = "0.31.0", features = ["logs", "grpc-tonic"] } bytemuck = "1.25.0" bytes = "1.11.0" bytemuck_derive = "1.10.2" +tokio = "1.49.0" [features] map-handlers = [] diff --git a/core/src/components/metrics/Cargo.toml b/core/src/components/metrics/Cargo.toml index 0e88d8c..c8dcb5b 100644 --- a/core/src/components/metrics/Cargo.toml +++ b/core/src/components/metrics/Cargo.toml @@ -20,8 +20,11 @@ tracing = "0.1.41" tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } libc = "0.2.172" bytemuck = "1.23.0" -cortexbrain-common = { path = "../../../common", features = [ +cortexbrain-common = { path = "../../../common/", features = [ "map-handlers", "program-handlers", + "buffer-reader", + "monitoring-structs", + "network-structs" ] } nix = { version = "0.30.1", features = ["net"] } diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index e0ab006..0968113 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -1,127 +1,11 @@ -use aya::{ - maps::{MapData, perf::PerfEventArrayBuffer}, - util::online_cpus, -}; - -use bytes::BytesMut; -use std::sync::{ - Arc, - atomic::{AtomicBool, Ordering}, -}; -use tokio::signal; - -use tracing::{error, info}; - +use aya::util::online_cpus; use cortexbrain_common::map_handlers::map_manager; use cortexbrain_common::{ - buffer_type::{BufferSize, BufferType}, - buffer_type::{NetworkMetrics, TimeStampMetrics}, + 
buffer_type::{BufferSize, BufferType, read_perf_buffer}, map_handlers::BpfMapsData, }; - -pub async fn display_metrics_map( - mut perf_buffers: Vec>, - running: Arc, // Changed to Arc - mut buffers: Vec, -) { - info!("Starting metrics event listener..."); - while running.load(Ordering::SeqCst) { - for buf in perf_buffers.iter_mut() { - match buf.read_events(&mut buffers) { - std::result::Result::Ok(events) => { - if events.read > 0 { - info!("Read {} metric events", events.read); - } - for i in 0..events.read { - let data = &buffers[i]; - if data.len() >= std::mem::size_of::() { - let net_metrics: NetworkMetrics = - unsafe { std::ptr::read_unaligned(data.as_ptr() as *const _) }; - let tgid = net_metrics.tgid; - let comm = String::from_utf8_lossy(&net_metrics.comm); - let ts_us = net_metrics.ts_us; - let sk_drop_count = net_metrics.sk_drops; - let sk_err = net_metrics.sk_err; - let sk_err_soft = net_metrics.sk_err_soft; - let sk_backlog_len = net_metrics.sk_backlog_len; - let sk_write_memory_queued = net_metrics.sk_write_memory_queued; - let sk_ack_backlog = net_metrics.sk_ack_backlog; - let sk_receive_buffer_size = net_metrics.sk_receive_buffer_size; - info!( - "tgid: {}, comm: {}, ts_us: {}, sk_drops: {}, sk_err: {}, sk_err_soft: {}, sk_backlog_len: {}, sk_write_memory_queued: {}, sk_ack_backlog: {}, sk_receive_buffer_size: {}", - tgid, - comm, - ts_us, - sk_drop_count, - sk_err, - sk_err_soft, - sk_backlog_len, - sk_write_memory_queued, - sk_ack_backlog, - sk_receive_buffer_size - ); - } else { - info!( - "Received data too small: {} bytes, expected: {}", - data.len(), - std::mem::size_of::() - ); - } - } - } - Err(e) => { - error!("Error reading events: {:?}", e); - } - } - } - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - info!("Metrics event listener stopped"); -} - -pub async fn display_time_stamp_events_map( - mut perf_buffers: Vec>, - running: Arc, // Changed to Arc - mut buffers: Vec, -) { - info!("Starting timestamp event 
listener..."); - while running.load(Ordering::SeqCst) { - for buf in perf_buffers.iter_mut() { - match buf.read_events(&mut buffers) { - std::result::Result::Ok(events) => { - if events.read > 0 { - info!("Read {} timestamp events", events.read); - } - for i in 0..events.read { - let data = &buffers[i]; - if data.len() >= std::mem::size_of::() { - let time_stamp_event: TimeStampMetrics = - unsafe { std::ptr::read_unaligned(data.as_ptr() as *const _) }; - let delta_us = time_stamp_event.delta_us; - let ts_us = time_stamp_event.ts_us; - let tgid = time_stamp_event.tgid; - let comm = String::from_utf8_lossy(&time_stamp_event.comm); - let lport = time_stamp_event.lport; - let dport_be = time_stamp_event.dport_be; - let af = time_stamp_event.af; - info!( - "TimeStampEvent - delta_us: {}, ts_us: {}, tgid: {}, comm: {}, lport: {}, dport_be: {}, af: {}", - delta_us, ts_us, tgid, comm, lport, dport_be, af - ); - } else { - info!("Received timestamp data too small: {} bytes", data.len()); - } - } - } - Err(e) => { - error!("Error reading timestamp events: {:?}", e); - } - } - } - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - info!("Timestamp event listener stopped"); -} +use tokio::signal; +use tracing::{error, info}; pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> { info!("Getting CPU count..."); @@ -146,30 +30,27 @@ pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> .remove("net_metrics") .expect("Cannot create net_perf_buffer"); - // Create shared running flags - let net_metrics_running = Arc::new(AtomicBool::new(true)); - let time_stamp_events_running = Arc::new(AtomicBool::new(true)); - // Create proper sized buffers let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); - // Clone for the signal handler - let net_metrics_running_signal = net_metrics_running.clone(); - let 
time_stamp_events_running_signal = time_stamp_events_running.clone(); - info!("Starting event listener tasks..."); let metrics_map_displayer = tokio::spawn(async move { - display_metrics_map(net_perf_buffer, net_metrics_running, net_metrics_buffers).await; + read_perf_buffer( + net_perf_buffer, + net_metrics_buffers, + BufferType::NetworkMetrics, + ) + .await; }); let time_stamp_events_displayer = tokio::spawn(async move { - display_time_stamp_events_map( + read_perf_buffer( time_stamp_events_perf_buffer, - time_stamp_events_running, time_stamp_events_buffers, + BufferType::TimeStampMetrics, ) - .await + .await; }); info!("Event listeners started, entering main loop..."); @@ -189,9 +70,6 @@ pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> _ = signal::ctrl_c() => { info!("Ctrl-C received, shutting down..."); - // Stop the event loops - net_metrics_running_signal.store(false, std::sync::atomic::Ordering::SeqCst); - time_stamp_events_running_signal.store(false, std::sync::atomic::Ordering::SeqCst); } } diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index e8677fb..e6c9069 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Ok}; use aya::Ebpf; -use cortexbrain_common::{constants, logger}; +use cortexbrain_common::constants; use std::{ env, fs, path::Path, @@ -11,15 +11,14 @@ use tracing::{error, info}; mod helpers; use crate::helpers::event_listener; +use cortexbrain_common::logger::otlp_logger_init; use cortexbrain_common::map_handlers::{init_bpf_maps, map_pinner}; use cortexbrain_common::program_handlers::load_program; -mod structs; - #[tokio::main] async fn main() -> Result<(), anyhow::Error> { //init tracing subscriber - logger::init_default_logger(); + let otlp_provider = otlp_logger_init("metrics-service".to_string()); info!("Starting metrics service..."); info!("fetching data"); @@ -78,6 +77,7 @@ async 
fn main() -> Result<(), anyhow::Error> { } Err(e) => { error!("Error initializing BPF maps: {:?}", e); + let _ = otlp_provider.shutdown(); return Err(e); } } From b05d9b9be81695462dc8b12d488b188040523b15 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Wed, 11 Feb 2026 21:56:34 +0100 Subject: [PATCH 04/17] [#158]: fixed imports from the common crate --- core/src/components/identity/src/main.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/components/identity/src/main.rs b/core/src/components/identity/src/main.rs index c70011e..d42564a 100644 --- a/core/src/components/identity/src/main.rs +++ b/core/src/components/identity/src/main.rs @@ -11,7 +11,7 @@ mod helpers; mod service_discovery; -use crate::helpers::{get_veth_channels, read_perf_buffer}; +use crate::helpers::get_veth_channels; use aya::{ Ebpf, programs::{SchedClassifier, TcAttachType, tc::SchedClassifierLinkId}, @@ -21,6 +21,7 @@ use aya::{ #[cfg(feature = "experimental")] use crate::helpers::scan_cgroup_cronjob; +use cortexbrain_common::buffer_type::read_perf_buffer; use cortexbrain_common::map_handlers::{ init_bpf_maps, map_manager, map_pinner, populate_blocklist, }; From a32698be18a2abf92f7636db42e45b8463c58309 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Wed, 11 Feb 2026 21:57:01 +0100 Subject: [PATCH 05/17] added TODOs in conntracker kernel module --- core/src/components/conntracker/src/main.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/components/conntracker/src/main.rs b/core/src/components/conntracker/src/main.rs index e723e4b..8438838 100644 --- a/core/src/components/conntracker/src/main.rs +++ b/core/src/components/conntracker/src/main.rs @@ -32,6 +32,10 @@ use crate::tc::try_identity_classifier; use crate::tcp_analyzer::try_tcp_analyzer; use crate::veth_tracer::try_veth_tracer; +// TODO: add function to track +// 1. kprobe:tcp_enter_memory_pressure +// 2. 
kprobe:tcp_create_openreq_child (https://elixir.bootlin.com/linux/v6.18.6/source/net/ipv4/tcp_ipv4.c#L1776) [function: *tcp_v4_syn_recv_sock] + // docs: // // virtual ethernet (veth) interface tracer: From 398bff0e337b3a023b6f6df13178836e9f79832c Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Wed, 11 Feb 2026 22:39:06 +0100 Subject: [PATCH 06/17] [#175]: added repr(C,packed) for monitoring structures. Fixed imports. Added error handling in event listener --- core/common/src/buffer_type.rs | 4 ++-- core/src/components/identity/src/main.rs | 23 +++++++++++-------- core/src/components/metrics/src/helpers.rs | 14 ++++++++++- core/src/components/metrics/src/main.rs | 10 ++++---- .../metrics_tracer/src/data_structures.rs | 6 ++--- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index ad906ce..ac0d600 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -86,7 +86,7 @@ unsafe impl aya::Pod for TcpPacketRegistry {} #[cfg(feature = "monitoring-structs")] pub const TASK_COMM_LEN: usize = 16; // linux/sched.h #[cfg(feature = "monitoring-structs")] -#[repr(C)] +#[repr(C, packed)] #[derive(Clone, Copy, Zeroable)] pub struct NetworkMetrics { pub tgid: u32, @@ -104,7 +104,7 @@ pub struct NetworkMetrics { unsafe impl aya::Pod for NetworkMetrics {} #[cfg(feature = "monitoring-structs")] -#[repr(C)] +#[repr(C, packed)] #[derive(Clone, Copy, Zeroable)] pub struct TimeStampMetrics { pub delta_us: u64, diff --git a/core/src/components/identity/src/main.rs b/core/src/components/identity/src/main.rs index d42564a..4efa3c9 100644 --- a/core/src/components/identity/src/main.rs +++ b/core/src/components/identity/src/main.rs @@ -21,21 +21,21 @@ use aya::{ #[cfg(feature = "experimental")] use crate::helpers::scan_cgroup_cronjob; -use cortexbrain_common::buffer_type::read_perf_buffer; -use cortexbrain_common::map_handlers::{ - init_bpf_maps, map_manager, map_pinner, 
populate_blocklist, +use cortexbrain_common::{ + buffer_type::{BufferSize, BufferType, read_perf_buffer}, + constants, logger, + map_handlers::BpfMapsData, + map_handlers::{init_bpf_maps, map_manager, map_pinner, populate_blocklist}, + program_handlers::load_program, }; -use cortexbrain_common::program_handlers::load_program; -use cortexbrain_common::{buffer_type::BufferType, map_handlers::BpfMapsData}; use std::{ convert::TryInto, path::Path, sync::{Arc, Mutex}, }; -use anyhow::{Context, Ok}; -use cortexbrain_common::buffer_type::BufferSize; -use cortexbrain_common::{constants, logger}; +use anyhow::{Context, Ok, anyhow}; + use std::collections::HashMap; use tokio::{fs, signal}; use tracing::{error, info}; @@ -206,7 +206,12 @@ async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> { // fill the input buffers with data from the PerfEventArrays for cpu_id in online_cpus().map_err(|e| anyhow::anyhow!("Error {:?}", e))? { for (name, (perf_evt_array, perf_evt_array_buffer)) in maps.iter_mut() { - let buf = perf_evt_array.open(cpu_id, None)?; + let buf = perf_evt_array.open(cpu_id, None).map_err(|e| { + anyhow!( + "Cannot create perf_event_array buffer from perf_event_array. Reason: {}", + e + ) + })?; info!( "Buffer created for map {:?} on cpu_id {:?}. 
Buffer size: {}", name, diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index 0968113..843f45d 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -1,3 +1,4 @@ +use anyhow::anyhow; use aya::util::online_cpus; use cortexbrain_common::map_handlers::map_manager; use cortexbrain_common::{ @@ -16,7 +17,18 @@ pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> for cpu_id in cpu_count { for (name, (perf_event_array, perf_event_buffer)) in maps.iter_mut() { - let buf = perf_event_array.open(cpu_id, None)?; + let buf = perf_event_array.open(cpu_id, None).map_err(|e| { + anyhow!( + "Cannot create perf_event_array buffer from perf_event_array. Reason: {}", + e + ) + })?; + info!( + "Buffer created for map {:?} on cpu_id {:?}. Buffer size: {}", + name, + cpu_id, + std::mem::size_of_val(&buf) + ); perf_event_buffer.push(buf); } } diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index e6c9069..e5558eb 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -1,6 +1,5 @@ use anyhow::{Context, Ok}; use aya::Ebpf; -use cortexbrain_common::constants; use std::{ env, fs, path::Path, @@ -11,9 +10,12 @@ use tracing::{error, info}; mod helpers; use crate::helpers::event_listener; -use cortexbrain_common::logger::otlp_logger_init; -use cortexbrain_common::map_handlers::{init_bpf_maps, map_pinner}; -use cortexbrain_common::program_handlers::load_program; +use cortexbrain_common::{ + constants, + logger::otlp_logger_init, + map_handlers::{init_bpf_maps, map_pinner}, + program_handlers::load_program, +}; #[tokio::main] async fn main() -> Result<(), anyhow::Error> { diff --git a/core/src/components/metrics_tracer/src/data_structures.rs b/core/src/components/metrics_tracer/src/data_structures.rs index f6d7afe..e9866a8 100644 --- 
a/core/src/components/metrics_tracer/src/data_structures.rs +++ b/core/src/components/metrics_tracer/src/data_structures.rs @@ -2,7 +2,7 @@ use aya_ebpf::{macros::map, maps::{LruPerCpuHashMap, HashMap, PerfEventArray}}; pub const TASK_COMM_LEN: usize = 16; - +#[repr(C,packed)] pub struct NetworkMetrics { pub tgid: u32, pub comm: [u8; TASK_COMM_LEN], @@ -16,7 +16,7 @@ pub struct NetworkMetrics { pub sk_drops: i32, // Offset 136 } -#[repr(C)] +#[repr(C,packed)] #[derive(Copy, Clone)] pub struct TimeStampStartInfo { pub comm: [u8; TASK_COMM_LEN], @@ -25,7 +25,7 @@ pub struct TimeStampStartInfo { } // Event we send to userspace when latency is computed -#[repr(C)] +#[repr(C,packed)] #[derive(Copy, Clone)] pub struct TimeStampEvent { pub delta_us: u64, From 01c63c4c6bb830c0383c112fdf3e2c6cc37c9e25 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Fri, 13 Feb 2026 22:11:40 +0100 Subject: [PATCH 07/17] [#158]: added control to skip load of blocklist if the addresses vector is empty. Added comments and annotations --- core/common/src/map_handlers.rs | 8 ++++++-- core/common/src/program_handlers.rs | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/core/common/src/map_handlers.rs b/core/common/src/map_handlers.rs index 313f73e..b246b70 100644 --- a/core/common/src/map_handlers.rs +++ b/core/common/src/map_handlers.rs @@ -121,8 +121,11 @@ pub async fn populate_blocklist() -> Result<(), Error> { .filter(|s| !s.is_empty()) .collect(); //String parsing from "x y" to ["x","y"] - info!("Inserting addresses: {:?}", addresses); - for item in addresses { + if addresses.is_empty() { + warn!("No addresses found in the blocklist. 
Skipping load"); + } + for item in &addresses { + info!("Inserting addresses: {:?}", &item); let addr = Ipv4Addr::from_str(&item)?.octets(); let _ = blocklist_map.insert(addr, addr, 0); } @@ -138,6 +141,7 @@ pub async fn populate_blocklist() -> Result<(), Error> { } #[cfg(feature = "map-handlers")] +// TODO: modify this to accept also HashMap types pub fn load_perf_event_array_from_mapdata( path: &'static str, ) -> Result, Error> { diff --git a/core/common/src/program_handlers.rs b/core/common/src/program_handlers.rs index 42cd3ba..347be51 100644 --- a/core/common/src/program_handlers.rs +++ b/core/common/src/program_handlers.rs @@ -13,32 +13,38 @@ pub fn load_program( .lock() .map_err(|e| anyhow::anyhow!("Cannot get value from lock. Reason: {}", e))?; - // Load and attach the eBPF programs + // Load and attach the eBPF program let program: &mut KProbe = bpf_new .program_mut(program_name) .ok_or_else(|| anyhow::anyhow!("Program {} not found", program_name))? .try_into() .map_err(|e| anyhow::anyhow!("Failed to convert program: {:?}", e))?; + // STEP 1: load program + program .load() .map_err(|e| anyhow::anyhow!("Cannot load program: {}. Error: {}", &program_name, e))?; + // STEP 2: Attach the loaded program to kernel symbol match program.attach(kernel_symbol, 0) { - Ok(_) => info!("{} program attached successfully", kernel_symbol), + Ok(_) => info!( + "{} program attached successfully to kernel symbol {}", + &program_name, &kernel_symbol + ), Err(e) => { - error!("Error attaching {} program {:?}", kernel_symbol, e); + error!( + "Error attaching {} program to kernel symbol {}. Reason: {:?}", + &program_name, &kernel_symbol, e + ); return Err(anyhow::anyhow!( - "Failed to attach {}: {:?}", - kernel_symbol, + "Failed to attach program {} to kernel symbol {}. 
Reason {:?}", + &program_name, + &kernel_symbol, e )); } }; - info!( - "eBPF program {} loaded and attached successfully", - program_name - ); Ok(()) } From 147802f11111dda6d9b7b5c4cf74eedf88913377 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Fri, 13 Feb 2026 22:14:25 +0100 Subject: [PATCH 08/17] [#158]: added shared hashmap to store tracked veth (TRACKED_VETH). The shared map is used to store the veth names and the status (attached or not) during the startup --- .../conntracker/src/data_structures.rs | 10 ++- core/src/components/identity/src/main.rs | 69 ++++++++++++++++--- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/core/src/components/conntracker/src/data_structures.rs b/core/src/components/conntracker/src/data_structures.rs index f4c5047..c55cd3f 100644 --- a/core/src/components/conntracker/src/data_structures.rs +++ b/core/src/components/conntracker/src/data_structures.rs @@ -47,7 +47,7 @@ pub struct ConnArray { // pid: kernel process ID // -#[repr(C,packed)] +#[repr(C, packed)] #[derive(Clone, Copy)] pub struct VethLog { pub name: [u8; 16], // 16 bytes: veth interface name @@ -94,9 +94,13 @@ pub static mut CONNTRACKER: LruPerCpuHashMap = pub static mut VETH_EVENTS: PerfEventArray = PerfEventArray::new(0); #[map(name = "Blocklist", pinning = "by_name")] -pub static mut BLOCKLIST: HashMap<[u8; 4], [u8; 4]> = - HashMap::<[u8; 4], [u8; 4]>::with_max_entries(1024, 0); +pub static mut BLOCKLIST: HashMap<[u8; 4], [u8; 4]> = HashMap::with_max_entries(1024, 0); //here i need to pass an address like this: [135,171,168,192] #[map(name = "TcpPacketRegistry", pinning = "by_name")] pub static mut PACKET_REGISTRY: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "tracked_veth", pinning = "by_name")] +// This map takes a registry of tracked veth interfaces +// The maximum number of characters is 16 of type u8 +pub static mut TRACKED_VETH: HashMap<[u8; 16], [u8; 8]> = HashMap::with_max_entries(1024, 0); diff --git 
a/core/src/components/identity/src/main.rs b/core/src/components/identity/src/main.rs index 4efa3c9..8d13e22 100644 --- a/core/src/components/identity/src/main.rs +++ b/core/src/components/identity/src/main.rs @@ -14,7 +14,8 @@ mod service_discovery; use crate::helpers::get_veth_channels; use aya::{ Ebpf, - programs::{SchedClassifier, TcAttachType, tc::SchedClassifierLinkId}, + maps::{Map, MapData}, + programs::{SchedClassifier, TcAttachType}, util::online_cpus, }; @@ -36,7 +37,7 @@ use std::{ use anyhow::{Context, Ok, anyhow}; -use std::collections::HashMap; +//use std::collections::HashMap; use tokio::{fs, signal}; use tracing::{error, info}; @@ -49,7 +50,7 @@ async fn main() -> Result<(), anyhow::Error> { info!("fetching data"); // To Store link_ids they can be used to detach tc - let link_ids = Arc::new(Mutex::new(HashMap::::new())); + //let mut link_ids = HashMap::::new(); //init conntracker data path let bpf_path = @@ -67,6 +68,7 @@ async fn main() -> Result<(), anyhow::Error> { "veth_identity_map".to_string(), "TcpPacketRegistry".to_string(), "Blocklist".to_string(), + "tracked_veth".to_string(), ]; match init_bpf_maps(bpf.clone(), map_data) { std::result::Result::Ok(bpf_maps) => { @@ -90,8 +92,8 @@ async fn main() -> Result<(), anyhow::Error> { } { - init_tc_classifier(bpf.clone(), interfaces, link_ids.clone()).await.context( - "An error occured during the execution of attach_bpf_program function" + init_tc_classifier(bpf.clone(), interfaces).await.context( + "An error occured during the execution of attach_bpf_program function", )?; } { @@ -120,10 +122,10 @@ async fn main() -> Result<(), anyhow::Error> { } //attach the tc classifier program to a vector of interfaces +// TODO: consider to create a load schedule classifier in the common functions async fn init_tc_classifier( bpf: Arc>, ifaces: Vec, - link_ids: Arc>>, ) -> Result<(), anyhow::Error> { //this funtion initialize the tc classifier program info!("Loading programs"); @@ -138,10 +140,33 @@ async fn 
init_tc_classifier( .try_into() .context("Failed to init SchedClassifier program")?; + // load classifier program + program .load() .context("Failed to load identity_classifier program")?; + // attach program only to desired interfaces. We can skip the dock0,tunl0,lo and eth0 interface + // we also save the interfaces to a BPF_HASH_MAP to easily monitor the interfaces using the agent + + // decleare link_ids HashMap which is a shared hashmap between kernel and userspace + // Link_ids hashmap has type of HashMap<[u8; 16], [u8; 8]>. The key is the program name and the value is the state + + // at this point the pinning is already successfull so we can invoque the maps from the pin + + let link_ids_mapdata = MapData::from_pin("/sys/fs/bpf/maps/tracked_veth") + .map_err(|e| anyhow!("Cannot return link_ids_mapdata. Reason: {}", e))?; + + let link_ids_map = Map::HashMap(link_ids_mapdata); + + let mut link_ids: aya::maps::HashMap = + aya::maps::HashMap::try_from(link_ids_map).map_err(|e| { + anyhow!( + "Cannot create link_ids HashMap from link_ids_map. Reason:{}", + e + ) + })?; + for interface in ifaces { match program.attach(&interface, TcAttachType::Ingress) { std::result::Result::Ok(link_id) => { @@ -149,10 +174,34 @@ async fn init_tc_classifier( "Program 'identity_classifier' attached to interface {}", interface ); - let mut map = link_ids - .lock() - .map_err(|e| anyhow::anyhow!("Cannot get value from lock. 
Reason: {}", e))?; - map.insert(interface.clone(), link_id); + let interface_bytes = interface.as_bytes(); + + let mut if_bytes = [0u8; 16]; + + // to set the len compare the interface_bytes.len() with the if_bytes.len() [16] and take the minimum + // if we have interface_bytes.len() < than 16 we set the len + let len = interface_bytes.len().min(if_bytes.len()); + + // now we can copy the bytes from the slice into the if_bytes variable + if_bytes[..len].copy_from_slice(&interface_bytes[..len]); + + // we compute the same process for the state_bytes + let mut state_bytes = [0u8; 8]; + let state = b"attached"; // prints "attached" as [u8;8] sequence of bytes + let state_len = state.len().min(state_bytes.len()); + state_bytes[..state_len].copy_from_slice(&state[..state_len]); + + match link_ids.insert(if_bytes, state_bytes, 0) { + std::result::Result::Ok(_) => { + info!("Veth interface {} added into map", &interface); + } + Err(e) => { + error!( + "Cannot add Veth interface {} into map. Reason: {}", + &interface, e + ); + } + } } Err(e) => error!( "Error attaching program to interface {}: {:?}", From 8599b901b4700f476eced98ceb93e24afba6baf4 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sat, 14 Feb 2026 13:53:34 +0100 Subject: [PATCH 09/17] [#182]: Added GetTrackedVethFromHashMap grpc endpoint to see the tracked veths (pt.2) --- cli/src/monitoring.rs | 33 +++++++------- core/api/Cargo.toml | 3 +- core/api/protos/agent.proto | 13 +++++- core/api/src/agent.rs | 90 ++++++++++++++++++++++++++++++++++++- core/api/src/api.rs | 53 +++++++++++++++++++--- core/api/src/requests.rs | 10 +++++ 6 files changed, 174 insertions(+), 28 deletions(-) diff --git a/cli/src/monitoring.rs b/cli/src/monitoring.rs index 72a94b8..eefae1c 100644 --- a/cli/src/monitoring.rs +++ b/cli/src/monitoring.rs @@ -10,7 +10,7 @@ use tonic_reflection::pb::v1::server_reflection_response::MessageResponse; use agent_api::client::{connect_to_client, connect_to_server_reflection}; use 
agent_api::requests::{ get_all_features, send_active_connection_request, send_dropped_packets_request, - send_latency_metrics_request, send_tracked_veth_request, + send_latency_metrics_request, send_tracked_veth_request, send_veth_tracked_hashmap_req, }; use crate::errors::CliError; @@ -304,25 +304,24 @@ pub async fn monitor_tracked_veth() -> Result<(), CliError> { "Connecting to cortexflow Client".white() ); match connect_to_client().await { - Ok(client) => match send_tracked_veth_request(client).await { + Ok(client) => match send_veth_tracked_hashmap_req(client).await { Ok(response) => { let veth_response = response.into_inner(); - if veth_response.tot_monitored_veth == 0 { - println!("{} {} ", "=====>".blue().bold(), "No tracked veth found"); - Ok(()) - } else { - println!( - "{} {} {} {} ", - "=====>".blue().bold(), - "Found:", - &veth_response.tot_monitored_veth, - "tracked veth" - ); - for veth in veth_response.veth_names.iter() { - println!("{} {}", "=====>".blue().bold(), &veth); - } - Ok(()) + // if veth_response.tot_monitored_veth == 0 { + // println!("{} {} ", "=====>".blue().bold(), "No tracked veth found"); + // Ok(()) + // } else { + // println!( + // "{} {} {} {} ", + // "=====>".blue().bold(), + // "Found:", + // &veth_response.tot_monitored_veth, + // "tracked veth" + // ); + for veth in veth_response.veths.iter() { + println!("{} {:?}", "=====>".blue().bold(), &veth); } + Ok(()) } Err(e) => { return Err(CliError::AgentError( diff --git a/core/api/Cargo.toml b/core/api/Cargo.toml index a422fd7..0070430 100644 --- a/core/api/Cargo.toml +++ b/core/api/Cargo.toml @@ -32,7 +32,8 @@ aya = "0.13.1" cortexbrain-common = { path = "../common", features = [ "map-handlers", "network-structs", - "buffer-reader" + "buffer-reader", + "monitoring-structs" ] } tonic-reflection = "0.14.0" tonic-build = "0.14.0" diff --git a/core/api/protos/agent.proto b/core/api/protos/agent.proto index 9bfc6e4..e2b1500 100644 --- a/core/api/protos/agent.proto +++ 
b/core/api/protos/agent.proto @@ -84,7 +84,13 @@ message VethEvent{ uint32 pid = 6; // Process ID } -//declare agent api +message VethHashMapResponse{ // returns tracked veth from the tracked_veth hashmap + string status = 1; + map veths = 2; +} + +// Agent Service + service Agent{ // active connections endpoint rpc ActiveConnections(RequestActiveConnections) returns (ActiveConnectionResponse); @@ -102,10 +108,15 @@ service Agent{ // dropped packets endpoint rpc GetDroppedPacketsMetrics(google.protobuf.Empty) returns (DroppedPacketsResponse); + // TODO: can i combine this 2 endpoints? // active veth info endpoint rpc GetTrackedVeth(google.protobuf.Empty) returns (VethResponse); + // get tracked veth from blocklist + rpc GetTrackedVethFromHashMap(google.protobuf.Empty) returns (VethHashMapResponse); } +// Blocklist + message AddIpToBlocklistRequest{ optional string ip = 1 ; } diff --git a/core/api/src/agent.rs b/core/api/src/agent.rs index cb93ddd..259c1ab 100644 --- a/core/api/src/agent.rs +++ b/core/api/src/agent.rs @@ -151,6 +151,17 @@ pub struct VethEvent { #[prost(uint32, tag = "6")] pub pid: u32, } +/// returns tracked veth from the tracked_veth hashmap +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct VethHashMapResponse { + #[prost(string, tag = "1")] + pub status: ::prost::alloc::string::String, + #[prost(map = "string, string", tag = "2")] + pub veths: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, +} #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct AddIpToBlocklistRequest { #[prost(string, optional, tag = "1")] @@ -192,7 +203,6 @@ pub mod agent_client { )] use tonic::codegen::*; use tonic::codegen::http::Uri; - /// declare agent api #[derive(Debug, Clone)] pub struct AgentClient { inner: tonic::client::Grpc, @@ -444,6 +454,31 @@ pub mod agent_client { .insert(GrpcMethod::new("agent.Agent", "GetTrackedVeth")); self.inner.unary(req, path, codec).await } + /// get tracked veth 
from blocklist + pub async fn get_tracked_veth_from_hash_map( + &mut self, + request: impl tonic::IntoRequest<()>, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/agent.Agent/GetTrackedVethFromHashMap", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("agent.Agent", "GetTrackedVethFromHashMap")); + self.inner.unary(req, path, codec).await + } } } /// Generated server implementations. @@ -511,8 +546,15 @@ pub mod agent_server { &self, request: tonic::Request<()>, ) -> std::result::Result, tonic::Status>; + /// get tracked veth from blocklist + async fn get_tracked_veth_from_hash_map( + &self, + request: tonic::Request<()>, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; } - /// declare agent api #[derive(Debug)] pub struct AgentServer { inner: Arc, @@ -885,6 +927,50 @@ pub mod agent_server { }; Box::pin(fut) } + "/agent.Agent/GetTrackedVethFromHashMap" => { + #[allow(non_camel_case_types)] + struct GetTrackedVethFromHashMapSvc(pub Arc); + impl tonic::server::UnaryService<()> + for GetTrackedVethFromHashMapSvc { + type Response = super::VethHashMapResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call(&mut self, request: tonic::Request<()>) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_tracked_veth_from_hash_map( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = 
self.inner.clone(); + let fut = async move { + let method = GetTrackedVethFromHashMapSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } _ => { Box::pin(async move { let mut response = http::Response::new( diff --git a/core/api/src/api.rs b/core/api/src/api.rs index 79b9df3..405f805 100644 --- a/core/api/src/api.rs +++ b/core/api/src/api.rs @@ -1,8 +1,8 @@ use anyhow::Context; +use anyhow::anyhow; use chrono::Local; use cortexbrain_common::formatters::{format_ipv4, format_ipv6}; use cortexbrain_common::map_handlers::load_perf_event_array_from_mapdata; -use prost::bytes::BytesMut; use std::str::FromStr; use std::sync::Mutex; use tonic::{Request, Response, Status}; @@ -28,7 +28,8 @@ use cortexbrain_common::buffer_type::VethLog; // * contains agent api configuration use crate::agent::{ ActiveConnectionResponse, AddIpToBlocklistRequest, BlocklistResponse, RequestActiveConnections, - RmIpFromBlocklistRequest, RmIpFromBlocklistResponse, VethResponse, agent_server::Agent, + RmIpFromBlocklistRequest, RmIpFromBlocklistResponse, VethHashMapResponse, VethResponse, + agent_server::Agent, }; use crate::constants::PIN_BLOCKLIST_MAP_PATH; @@ -38,6 +39,9 @@ use cortexbrain_common::buffer_type::IpProtocols; use std::net::Ipv4Addr; use tracing::warn; +use cortexbrain_common::buffer_type::BufferSize; +use cortexbrain_common::map_handlers::map_manager; + pub struct AgentApi { //* event_rx is an istance of a mpsc receiver. //* is used to receive the data from the transmitter (tx) @@ -162,6 +166,9 @@ impl Default for AgentApi { tracked_veth_tx: veth_tx.clone(), }; + // init map manager + //let map_manager = map_manager(maps)? 
+ // For network metrics //spawn an event readers @@ -177,7 +184,7 @@ impl Default for AgentApi { .open(cpu_id, None) .expect("Error during the creation of net_events_buf structure"); - let buffers = vec![BytesMut::with_capacity(4096); 8]; + let buffers = BufferSize::ClassifierNetEvents.set_buffer(); net_events_buffer.push((buf, buffers)); } @@ -262,7 +269,7 @@ impl Default for AgentApi { .open(cpu_id, None) .expect("Error during the creation of net_metrics_buf structure"); - let buffers = vec![BytesMut::with_capacity(4096); 8]; + let buffers = BufferSize::NetworkMetricsEvents.set_buffer(); net_metrics_buffer.push((buf, buffers)); } @@ -343,7 +350,7 @@ impl Default for AgentApi { .open(cpu_id, None) .expect("Error during the creation of time stamp events buf structure"); - let buffers = vec![BytesMut::with_capacity(4096); 8]; + let buffers = BufferSize::TimeMetricsEvents.set_buffer(); ts_events_buffer.push((buf, buffers)); } @@ -421,7 +428,7 @@ impl Default for AgentApi { .open(cpu_id, None) .expect("Error during the creation of time stamp events buf structure"); - let buffers = vec![BytesMut::with_capacity(4096); 8]; + let buffers = BufferSize::VethEvents.set_buffer(); veth_events_buffer.push((buf, buffers)); } @@ -560,7 +567,10 @@ impl Agent for AgentApi { //convert ip from string to [u8;4] type and insert into the bpf map let u8_4_ip = Ipv4Addr::from_str(&ip).unwrap().octets(); //TODO: convert datetime in a kernel compatible format - blocklist_map.insert(u8_4_ip, u8_4_ip, 0); + blocklist_map + .insert(u8_4_ip, u8_4_ip, 0) + .map_err(|e| anyhow!("Cannot insert address in the blocklist. 
Reason: {}", e)) + .unwrap(); info!("CURRENT BLOCKLIST: {:?}", blocklist_map); } let path = std::env::var(PIN_BLOCKLIST_MAP_PATH) @@ -774,4 +784,33 @@ impl Agent for AgentApi { Ok(Response::new(response)) } + + async fn get_tracked_veth_from_hash_map( + &self, + request: Request<()>, + ) -> Result, Status> { + info!("Returning veth hashmap"); + //open blocklist map + let mapdata = MapData::from_pin("/sys/fs/bpf/maps/tracked_veth") + .expect("cannot open tracked_veth Mapdata"); + let tracked_veth_mapdata = Map::HashMap(mapdata); //load mapdata + + let tracked_veth_map: ayaHashMap = + ayaHashMap::try_from(tracked_veth_mapdata).unwrap(); + + //convert the maps with a buffer to match the protobuffer types + + let mut converted_tracked_veth_map: HashMap = HashMap::new(); + for item in tracked_veth_map.iter() { + let (k, v) = item.unwrap(); + // convert keys and values from [u8;4] to String + let key = String::from_utf8(k.to_vec()).unwrap(); + let value = String::from_utf8(v.to_vec()).unwrap(); + converted_tracked_veth_map.insert(key, value); + } + Ok(Response::new(VethHashMapResponse { + status: "success".to_string(), + veths: converted_tracked_veth_map, + })) + } } diff --git a/core/api/src/requests.rs b/core/api/src/requests.rs index 06a4030..7c9f447 100644 --- a/core/api/src/requests.rs +++ b/core/api/src/requests.rs @@ -14,6 +14,7 @@ use crate::agent::LatencyMetricsResponse; use crate::agent::RequestActiveConnections; use crate::agent::RmIpFromBlocklistRequest; use crate::agent::RmIpFromBlocklistResponse; +use crate::agent::VethHashMapResponse; use crate::agent::VethResponse; use crate::agent::agent_client::AgentClient; @@ -100,3 +101,12 @@ pub async fn send_tracked_veth_request( let response = client.get_tracked_veth(request).await?; Ok(response) } + +#[cfg(feature = "client")] +pub async fn send_veth_tracked_hashmap_req( + mut client: AgentClient, +) -> Result, Error> { + let request = Request::new(()); + let response = 
client.get_tracked_veth_from_hash_map(request).await?; + Ok(response) +} From 4d23d4cd3953cb77ba62689ef2856c6b478fab45 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 2 Mar 2026 17:58:20 +0100 Subject: [PATCH 10/17] [#158]: removed struct.rs Added fill_buffers in buffer_type.rs. Simplified buffers filling and buffer initialization --- core/api/src/agent.rs | 2 + core/api/src/api.rs | 234 +++++++++------------------------ core/api/src/lib.rs | 1 - core/api/src/main.rs | 1 - core/api/src/structs.rs | 48 ------- core/common/src/buffer_type.rs | 22 ++++ 6 files changed, 89 insertions(+), 219 deletions(-) delete mode 100644 core/api/src/structs.rs diff --git a/core/api/src/agent.rs b/core/api/src/agent.rs index 259c1ab..8d004b9 100644 --- a/core/api/src/agent.rs +++ b/core/api/src/agent.rs @@ -432,6 +432,7 @@ pub mod agent_client { .insert(GrpcMethod::new("agent.Agent", "GetDroppedPacketsMetrics")); self.inner.unary(req, path, codec).await } + /// TODO: can i combine this 2 endpoints? /// active veth info endpoint pub async fn get_tracked_veth( &mut self, @@ -541,6 +542,7 @@ pub mod agent_server { tonic::Response, tonic::Status, >; + /// TODO: can i combine this 2 endpoints? 
/// active veth info endpoint async fn get_tracked_veth( &self, diff --git a/core/api/src/api.rs b/core/api/src/api.rs index 405f805..ba25101 100644 --- a/core/api/src/api.rs +++ b/core/api/src/api.rs @@ -1,6 +1,11 @@ use anyhow::Context; use anyhow::anyhow; +use aya::maps::perf::PerfEventArrayBuffer; use chrono::Local; +use cortexbrain_common::buffer_type::IpProtocols; +use cortexbrain_common::buffer_type::NetworkMetrics; +use cortexbrain_common::buffer_type::PacketLog; +use cortexbrain_common::buffer_type::TimeStampMetrics; use cortexbrain_common::formatters::{format_ipv4, format_ipv6}; use cortexbrain_common::map_handlers::load_perf_event_array_from_mapdata; use std::str::FromStr; @@ -8,7 +13,7 @@ use std::sync::Mutex; use tonic::{Request, Response, Status}; use tracing::info; -use aya::{maps::MapData, util::online_cpus}; +use aya::maps::MapData; use std::result::Result::Ok; use tonic::async_trait; @@ -22,7 +27,6 @@ use crate::agent::{ LatencyMetricsResponse, VethEvent, }; -use crate::structs::{NetworkMetrics, PacketLog, TimeStampMetrics}; use cortexbrain_common::buffer_type::VethLog; // * contains agent api configuration @@ -35,97 +39,23 @@ use crate::constants::PIN_BLOCKLIST_MAP_PATH; use crate::helpers::comm_to_string; use aya::maps::Map; -use cortexbrain_common::buffer_type::IpProtocols; use std::net::Ipv4Addr; use tracing::warn; use cortexbrain_common::buffer_type::BufferSize; -use cortexbrain_common::map_handlers::map_manager; +use cortexbrain_common::buffer_type::fill_buffers; pub struct AgentApi { //* event_rx is an istance of a mpsc receiver. 
//* is used to receive the data from the transmitter (tx) active_connection_event_rx: Mutex, Status>>>, - active_connection_event_tx: mpsc::Sender, Status>>, + pub(crate) active_connection_event_tx: mpsc::Sender, Status>>, latency_metrics_rx: Mutex, Status>>>, - latency_metrics_tx: mpsc::Sender, Status>>, + pub(crate) latency_metrics_tx: mpsc::Sender, Status>>, dropped_packet_metrics_rx: Mutex, Status>>>, - dropped_packet_metrics_tx: mpsc::Sender, Status>>, + pub(crate) dropped_packet_metrics_tx: mpsc::Sender, Status>>, tracked_veth_rx: Mutex, Status>>>, - tracked_veth_tx: mpsc::Sender, Status>>, -} - -//* Event sender trait. Takes an event from a map and send that to the mpsc channel -//* using the send_map function -#[async_trait] -pub trait EventSender: Send + Sync + 'static { - async fn send_active_connection_event(&self, event: Vec); - async fn send_active_connection_event_map( - &self, - map: Vec, - tx: mpsc::Sender, Status>>, - ) { - let status = Status::new(tonic::Code::Ok, "success"); - let event = Ok(map); - - let _ = tx.send(event).await; - } - - async fn send_latency_metrics_event(&self, event: Vec); - async fn send_latency_metrics_event_map( - &self, - map: Vec, - tx: mpsc::Sender, Status>>, - ) { - let status = Status::new(tonic::Code::Ok, "success"); - let event = Ok(map); - let _ = tx.send(event).await; - } - - async fn send_dropped_packet_metrics_event(&self, event: Vec); - async fn send_dropped_packet_metrics_event_map( - &self, - map: Vec, - tx: mpsc::Sender, Status>>, - ) { - let status = Status::new(tonic::Code::Ok, "success"); - let event = Ok(map); - let _ = tx.send(event).await; - } - - async fn send_tracked_veth_event(&self, event: Vec); - async fn send_tracked_veth_event_map( - &self, - map: Vec, - tx: mpsc::Sender, Status>>, - ) { - let status = Status::new(tonic::Code::Ok, "success"); - let event = Ok(map); - let _ = tx.send(event).await; - } -} - -// send event function. 
takes an HashMap and send that using mpsc event_tx -#[async_trait] -impl EventSender for AgentApi { - async fn send_active_connection_event(&self, event: Vec) { - self.send_active_connection_event_map(event, self.active_connection_event_tx.clone()) - .await; - } - - async fn send_latency_metrics_event(&self, event: Vec) { - self.send_latency_metrics_event_map(event, self.latency_metrics_tx.clone()) - .await; - } - - async fn send_dropped_packet_metrics_event(&self, event: Vec) { - self.send_dropped_packet_metrics_event_map(event, self.dropped_packet_metrics_tx.clone()) - .await; - } - async fn send_tracked_veth_event(&self, event: Vec) { - self.send_tracked_veth_event_map(event, self.tracked_veth_tx.clone()) - .await; - } + pub(crate) tracked_veth_tx: mpsc::Sender, Status>>, } //initialize a default trait for AgentApi. Loads a name and a bpf istance. @@ -137,13 +67,13 @@ impl Default for AgentApi { // // TODO: in the future will be better to not use .unwrap() - let mut active_connection_events_array = + let active_connection_events_array = load_perf_event_array_from_mapdata("/sys/fs/bpf/maps/events_map").unwrap(); - let mut network_metrics_events_array = + let network_metrics_events_array = load_perf_event_array_from_mapdata("/sys/fs/bpf/trace_maps/net_metrics").unwrap(); - let mut time_stamp_events_array = + let time_stamp_events_array = load_perf_event_array_from_mapdata("/sys/fs/bpf/trace_maps/time_stamp_events").unwrap(); - let mut tracked_veth_events_array = + let tracked_veth_events_array = load_perf_event_array_from_mapdata("/sys/fs/bpf/maps/veth_identity_map").unwrap(); // @@ -155,6 +85,7 @@ impl Default for AgentApi { let (drop_tx, drop_rx) = mpsc::channel(2048); let (veth_tx, tracked_veth_rx) = mpsc::channel(1024); + // init the API to send the events from the agent to the CLI let api = AgentApi { active_connection_event_rx: conn_rx.into(), active_connection_event_tx: conn_tx.clone(), @@ -169,35 +100,42 @@ impl Default for AgentApi { // init map manager 
//let map_manager = map_manager(maps)? + // init the buffers + let mut net_events_buffers = BufferSize::TcpEvents.set_buffer(); + let mut net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); + let mut ts_metrics_buffers = BufferSize::TimeMetricsEvents.set_buffer(); + let mut veth_metrics_buffers = BufferSize::VethEvents.set_buffer(); + + // init the Vec of Buffers + + let mut net_events_vec_buffer = Vec::>::new(); + let mut net_metrics_vec_buffer = Vec::>::new(); + let mut ts_events_vec_buffer = Vec::>::new(); + let mut veth_events_vec_buffer = Vec::>::new(); + + // fill the Vec of Buffers + + net_events_vec_buffer = fill_buffers(net_events_vec_buffer, active_connection_events_array); + net_metrics_vec_buffer = fill_buffers(net_metrics_vec_buffer, network_metrics_events_array); + + ts_events_vec_buffer = fill_buffers(ts_events_vec_buffer, time_stamp_events_array); + + veth_events_vec_buffer = fill_buffers(veth_events_vec_buffer, tracked_veth_events_array); + // For network metrics //spawn an event readers task::spawn(async move { - let mut net_events_buffer = Vec::new(); - //scan the cpus to read the data - - for cpu_id in online_cpus() - .map_err(|e| anyhow::anyhow!("Error {:?}", e)) - .unwrap() - { - let buf = active_connection_events_array - .open(cpu_id, None) - .expect("Error during the creation of net_events_buf structure"); - - let buffers = BufferSize::ClassifierNetEvents.set_buffer(); - net_events_buffer.push((buf, buffers)); - } - info!("Starting event listener"); //send the data through a mpsc channel loop { - for (buf, buffers) in net_events_buffer.iter_mut() { - match buf.read_events(buffers) { + for buf in net_events_vec_buffer.iter_mut() { + match buf.read_events(&mut net_events_buffers) { Ok(events) => { //read the events, this function is similar to the one used in identity/helpers.rs/display_events if events.read > 0 { for i in 0..events.read { - let data = &buffers[i]; + let data = &net_events_buffers[i]; if data.len() >= 
std::mem::size_of::() { let pl: PacketLog = unsafe { std::ptr::read(data.as_ptr() as *const _) }; @@ -258,32 +196,17 @@ impl Default for AgentApi { }); task::spawn(async move { - let mut net_metrics_buffer = Vec::new(); - - //scan the cpus to read the data - for cpu_id in online_cpus() - .map_err(|e| anyhow::anyhow!("Error {:?}", e)) - .unwrap() - { - let buf = network_metrics_events_array - .open(cpu_id, None) - .expect("Error during the creation of net_metrics_buf structure"); - - let buffers = BufferSize::NetworkMetricsEvents.set_buffer(); - net_metrics_buffer.push((buf, buffers)); - } - info!("Starting network metrics listener"); //send the data through a mpsc channel loop { - for (buf, buffers) in net_metrics_buffer.iter_mut() { - match buf.read_events(buffers) { + for buf in net_metrics_vec_buffer.iter_mut() { + match buf.read_events(&mut net_metrics_buffers) { Ok(events) => { //read the events, this function is similar to the one used in identity/helpers.rs/display_events if events.read > 0 { for i in 0..events.read { - let data = &buffers[i]; + let data = &net_metrics_buffers[i]; if data.len() >= std::mem::size_of::() { let nm: NetworkMetrics = unsafe { std::ptr::read(data.as_ptr() as *const _) }; @@ -340,34 +263,22 @@ impl Default for AgentApi { }); task::spawn(async move { - let mut ts_events_buffer = Vec::new(); - //scan the cpus to read the data - for cpu_id in online_cpus() - .map_err(|e| anyhow::anyhow!("Error {:?}", e)) - .unwrap() - { - let buf = time_stamp_events_array - .open(cpu_id, None) - .expect("Error during the creation of time stamp events buf structure"); - - let buffers = BufferSize::TimeMetricsEvents.set_buffer(); - ts_events_buffer.push((buf, buffers)); - } - info!("Starting time stamp events listener"); //send the data through a mpsc channel loop { - for (buf, buffers) in ts_events_buffer.iter_mut() { - match buf.read_events(buffers) { + for buf in ts_events_vec_buffer.iter_mut() { + match buf.read_events(&mut ts_metrics_buffers) { 
Ok(events) => { //read the events, this function is similar to the one used in identity/helpers.rs/display_events if events.read > 0 { for i in 0..events.read { - let data = &buffers[i]; + let data = &ts_metrics_buffers[i]; if data.len() >= std::mem::size_of::() { let tsm: TimeStampMetrics = unsafe { std::ptr::read(data.as_ptr() as *const _) }; + let saddr_v6 = tsm.saddr_v6; + let daddr_v6 = tsm.daddr_v6; let latency_metric = LatencyMetric { delta_us: tsm.delta_us, timestamp_us: tsm.ts_us, @@ -378,8 +289,8 @@ impl Default for AgentApi { address_family: tsm.af as u32, src_address_v4: format_ipv4(tsm.saddr_v4), dst_address_v4: format_ipv4(tsm.daddr_v4), - src_address_v6: format_ipv6(&tsm.saddr_v6), - dst_address_v6: format_ipv6(&tsm.daddr_v6), + src_address_v6: format_ipv6(&saddr_v6), + dst_address_v6: format_ipv6(&daddr_v6), }; info!( "Latency Metric - tgid: {}, process_name: {}, delta_us: {}, timestamp_us: {}, local_port: {}, remote_port: {}, address_family: {}, src_address_v4: {}, dst_address_v4: {}, src_address_v6: {}, dst_address_v6: {}", @@ -416,34 +327,19 @@ impl Default for AgentApi { } }); - // TODO: this part needs a better implementation task::spawn(async move { - let mut veth_events_buffer = Vec::new(); - //scan the cpus to read the data - for cpu_id in online_cpus() - .map_err(|e| anyhow::anyhow!("Error {:?}", e)) - .unwrap() - { - let buf = tracked_veth_events_array - .open(cpu_id, None) - .expect("Error during the creation of time stamp events buf structure"); - - let buffers = BufferSize::VethEvents.set_buffer(); - veth_events_buffer.push((buf, buffers)); - } - info!("Starting time stamp events listener"); //send the data through a mpsc channel loop { - for (buf, buffers) in veth_events_buffer.iter_mut() { - match buf.read_events(buffers) { + for buf in veth_events_vec_buffer.iter_mut() { + match buf.read_events(&mut veth_metrics_buffers) { Ok(events) => { //read the events, this function is similar to the one used in 
identity/helpers.rs/display_events if events.read > 0 { for i in 0..events.read { info!("Found veth events {}", events.read); - let data = &buffers[i]; + let data = &veth_metrics_buffers[i]; if data.len() >= std::mem::size_of::() { let veth: VethLog = unsafe { std::ptr::read(data.as_ptr() as *const _) }; @@ -515,7 +411,7 @@ impl Agent for AgentApi { request: Request, ) -> Result, Status> { //read request - let req = request.into_inner(); + let _req = request.into_inner(); //create the hashmap to process events from the mpsc channel queue let mut aggregated_events: Vec = Vec::new(); @@ -562,7 +458,7 @@ impl Agent for AgentApi { } else { // add ip to the blocklist // log blocklist event - let datetime = Local::now().to_string(); + let _datetime = Local::now().to_string(); let ip = req.ip.unwrap(); //convert ip from string to [u8;4] type and insert into the bpf map let u8_4_ip = Ipv4Addr::from_str(&ip).unwrap().octets(); @@ -573,14 +469,14 @@ impl Agent for AgentApi { .unwrap(); info!("CURRENT BLOCKLIST: {:?}", blocklist_map); } - let path = std::env::var(PIN_BLOCKLIST_MAP_PATH) + let _path = std::env::var(PIN_BLOCKLIST_MAP_PATH) .context("Blocklist map path not found!") .unwrap(); //convert the maps with a buffer to match the protobuffer types let mut converted_blocklist_map: HashMap = HashMap::new(); for item in blocklist_map.iter() { - let (k, v) = item.unwrap(); + let (k, _v) = item.unwrap(); // convert keys and values from [u8;4] to String let key = Ipv4Addr::from(k).to_string(); let value = Ipv4Addr::from(k).to_string(); @@ -596,7 +492,7 @@ impl Agent for AgentApi { async fn check_blocklist( &self, - request: Request<()>, + _request: Request<()>, ) -> Result, Status> { info!("Returning blocklist hashmap"); //open blocklist map @@ -611,7 +507,7 @@ impl Agent for AgentApi { let mut converted_blocklist_map: HashMap = HashMap::new(); for item in blocklist_map.iter() { - let (k, v) = item.unwrap(); + let (k, _v) = item.unwrap(); // convert keys and values from [u8;4] 
to String let key = Ipv4Addr::from(k).to_string(); let value = Ipv4Addr::from(k).to_string(); @@ -638,7 +534,7 @@ impl Agent for AgentApi { //remove the address let ip_to_remove = req.ip; let u8_4_ip_to_remove = Ipv4Addr::from_str(&ip_to_remove).unwrap().octets(); - blocklist_map.remove(&u8_4_ip_to_remove); + let _ = blocklist_map.remove(&u8_4_ip_to_remove); //convert the maps with a buffer to match the protobuffer types let mut converted_blocklist_map: HashMap = HashMap::new(); @@ -661,7 +557,7 @@ impl Agent for AgentApi { request: Request<()>, ) -> Result, Status> { // Extract the request parameters - let req = request.into_inner(); + let _req = request.into_inner(); info!("Getting latency metrics"); // Here you would typically query your data source for the latency metrics @@ -724,7 +620,7 @@ impl Agent for AgentApi { request: Request<()>, ) -> Result, Status> { // Extract the request parameters - let req = request.into_inner(); + let _req = request.into_inner(); info!("Getting dropped packets metrics"); let mut aggregated_dropped_packet_metrics: Vec = Vec::new(); @@ -759,7 +655,7 @@ impl Agent for AgentApi { &self, request: Request<()>, ) -> Result, Status> { - let req = request.into_inner(); + let _req = request.into_inner(); info!("Getting tracked veth metrics"); let mut tracked_veth = Vec::::new(); let mut tot_veth = 0 as i32; @@ -787,7 +683,7 @@ impl Agent for AgentApi { async fn get_tracked_veth_from_hash_map( &self, - request: Request<()>, + _request: Request<()>, ) -> Result, Status> { info!("Returning veth hashmap"); //open blocklist map diff --git a/core/api/src/lib.rs b/core/api/src/lib.rs index cf2c0c9..e093920 100644 --- a/core/api/src/lib.rs +++ b/core/api/src/lib.rs @@ -2,7 +2,6 @@ pub mod api; pub mod agent; pub mod client; pub mod requests; -pub mod structs; pub mod constants; pub mod helpers; pub mod batcher; diff --git a/core/api/src/main.rs b/core/api/src/main.rs index 30fe550..87478f5 100644 --- a/core/api/src/main.rs +++ 
b/core/api/src/main.rs @@ -6,7 +6,6 @@ mod agent; mod api; mod constants; mod helpers; -mod structs; mod agent_proto { use tonic::include_file_descriptor_set; diff --git a/core/api/src/structs.rs b/core/api/src/structs.rs deleted file mode 100644 index 97a4017..0000000 --- a/core/api/src/structs.rs +++ /dev/null @@ -1,48 +0,0 @@ -use bytemuck_derive::Zeroable; -use crate::constants::TASK_COMM_LEN; - - -#[repr(C)] -#[derive(Clone, Copy, Zeroable)] -pub struct PacketLog { - pub proto: u8, - pub src_ip: u32, - pub src_port: u16, - pub dst_ip: u32, - pub dst_port: u16, - pub pid: u32, -} -unsafe impl aya::Pod for PacketLog {} - -#[repr(C, packed)] -#[derive(Clone, Copy, Zeroable)] -pub struct NetworkMetrics { - pub tgid: u32, - pub comm: [u8; TASK_COMM_LEN], - pub ts_us: u64, - pub sk_err: i32, - pub sk_err_soft: i32, - pub sk_backlog_len: i32, - pub sk_write_memory_queued: i32, - pub sk_receive_buffer_size: i32, - pub sk_ack_backlog: u32, - pub sk_drops: i32, -} -unsafe impl aya::Pod for NetworkMetrics {} - -#[repr(C)] -#[derive(Clone, Copy, Zeroable)] -pub struct TimeStampMetrics { - pub delta_us: u64, - pub ts_us: u64, - pub tgid: u32, - pub comm: [u8; TASK_COMM_LEN], - pub lport: u16, - pub dport_be: u16, - pub af: u16, - pub saddr_v4: u32, - pub daddr_v4: u32, - pub saddr_v6: [u32; 4], - pub daddr_v6: [u32; 4], -} -unsafe impl aya::Pod for TimeStampMetrics {} diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index ac0d600..f962698 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -1,3 +1,5 @@ +#[cfg(feature = "buffer-reader")] +use aya::maps::{MapData, PerfEventArray}; use aya::{maps::perf::PerfEventArrayBuffer, util::online_cpus}; use bytemuck_derive::Zeroable; use bytes::BytesMut; @@ -560,3 +562,23 @@ impl BufferSize { } } } + +#[cfg(feature = "buffer-reader")] +pub fn fill_buffers( + //buf: PerfEventArrayBuffer, + mut vec_of_buffers: Vec>, + //buffers: Vec, + mut events_array: PerfEventArray, +) -> 
Vec> { + for cpu_id in online_cpus() + .map_err(|e| anyhow::anyhow!("Error {:?}", e)) + .unwrap() + { + let buf = events_array + .open(cpu_id, None) + .expect("Error during the creation of net_events_buf structure"); + + vec_of_buffers.push(buf); + } + vec_of_buffers +} From 76d462bfa4f10219375ad73ea611119f40657d4b Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 2 Mar 2026 18:02:40 +0100 Subject: [PATCH 11/17] [#158]: moved Event Sender trait in the batcher.rs module --- core/api/src/batcher.rs | 89 +++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 12 deletions(-) diff --git a/core/api/src/batcher.rs b/core/api/src/batcher.rs index 6e984d5..12d9278 100644 --- a/core/api/src/batcher.rs +++ b/core/api/src/batcher.rs @@ -1,22 +1,87 @@ // This module is experimental and may be subject to major changes. -use crate::agent::{ConnectionEvent, DroppedPacketMetric, LatencyMetric}; +// Do not use any of these functions +// FIXME: this module will be deprecated in the next version probably -pub enum MetricsBatcher { - LatencyMetrics, - DroppedPacketsMetrics, -} -pub enum EventBatcher {} -impl MetricsBatcher { - pub async fn send_batched_metrics() { - todo!(); +use tokio::sync::mpsc; +use tonic::{Status, async_trait}; + +use crate::{ + agent::{ConnectionEvent, DroppedPacketMetric, LatencyMetric, VethEvent}, + api::AgentApi, +}; + +// Event sender trait. 
Takes an event from a map and send that to the mpsc channel +// using the send_map function +#[async_trait] +pub trait EventSender: Send + Sync + 'static { + async fn send_active_connection_event(&self, event: Vec); + async fn send_active_connection_event_map( + &self, + map: Vec, + tx: mpsc::Sender, Status>>, + ) { + let status = Status::new(tonic::Code::Ok, "success"); + let event = Ok(map); + + let _ = tx.send(event).await; + } + + async fn send_latency_metrics_event(&self, event: Vec); + async fn send_latency_metrics_event_map( + &self, + map: Vec, + tx: mpsc::Sender, Status>>, + ) { + let status = Status::new(tonic::Code::Ok, "success"); + let event = Ok(map); + let _ = tx.send(event).await; + } + + async fn send_dropped_packet_metrics_event(&self, event: Vec); + async fn send_dropped_packet_metrics_event_map( + &self, + map: Vec, + tx: mpsc::Sender, Status>>, + ) { + let status = Status::new(tonic::Code::Ok, "success"); + let event = Ok(map); + let _ = tx.send(event).await; + } + + async fn send_tracked_veth_event(&self, event: Vec); + async fn send_tracked_veth_event_map( + &self, + map: Vec, + tx: mpsc::Sender, Status>>, + ) { + let status = Status::new(tonic::Code::Ok, "success"); + let event = Ok(map); + let _ = tx.send(event).await; } } -impl EventBatcher { - pub async fn send_batched_logs() { - todo!(); +// send event function. 
takes an HashMap and send that using mpsc event_tx +#[async_trait] +impl EventSender for AgentApi { + async fn send_active_connection_event(&self, event: Vec) { + self.send_active_connection_event_map(event, self.active_connection_event_tx.clone()) + .await; + } + + async fn send_latency_metrics_event(&self, event: Vec) { + self.send_latency_metrics_event_map(event, self.latency_metrics_tx.clone()) + .await; + } + + async fn send_dropped_packet_metrics_event(&self, event: Vec) { + self.send_dropped_packet_metrics_event_map(event, self.dropped_packet_metrics_tx.clone()) + .await; + } + async fn send_tracked_veth_event(&self, event: Vec) { + self.send_tracked_veth_event_map(event, self.tracked_veth_tx.clone()) + .await; } } From bf1720bab06a56e6d677097783f9dea2a5dbcd01 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 2 Mar 2026 18:26:28 +0100 Subject: [PATCH 12/17] [deprecated]: removed deprecated Scripts folder --- Scripts/check-cortexflow-components.sh | 21 -------- Scripts/check-dev-requisites.sh | 41 --------------- Scripts/install-debugging-tools.sh | 45 ---------------- Scripts/test-connections.sh | 49 ------------------ Scripts/test-proxy-endpoints.sh | 45 ---------------- Scripts/test-proxy-ports.sh | 18 ------- Scripts/test-sidecar-advanced-tcp.sh | 67 ------------------------ Scripts/test-sidecar-advanced-udp.sh | 70 ------------------------- Scripts/test-sidecar-proxy.sh | 71 -------------------------- 9 files changed, 427 deletions(-) delete mode 100755 Scripts/check-cortexflow-components.sh delete mode 100755 Scripts/check-dev-requisites.sh delete mode 100755 Scripts/install-debugging-tools.sh delete mode 100755 Scripts/test-connections.sh delete mode 100755 Scripts/test-proxy-endpoints.sh delete mode 100755 Scripts/test-proxy-ports.sh delete mode 100755 Scripts/test-sidecar-advanced-tcp.sh delete mode 100755 Scripts/test-sidecar-advanced-udp.sh delete mode 100755 Scripts/test-sidecar-proxy.sh diff --git 
a/Scripts/check-cortexflow-components.sh b/Scripts/check-cortexflow-components.sh deleted file mode 100755 index 01232cb..0000000 --- a/Scripts/check-cortexflow-components.sh +++ /dev/null @@ -1,21 +0,0 @@ -echo "Welcome to CortexFlow tools" -echo "Checking CortexFlow components" - -echo "Checking if CortexFlow namespace exists..." -if kubectl get namespace cortexflow >/dev/null 2>&1; then - echo "✅ Namespace 'cortexflow' exists." - - sleep 1 - echo "Checking pods..." - kubectl get pods -n cortexflow - - echo - - sleep 1 - echo "Checking services..." - kubectl get svc -n cortexflow - echo -else - echo "❌ Namespace 'cortexflow' does not exist." - exit 1 -fi diff --git a/Scripts/check-dev-requisites.sh b/Scripts/check-dev-requisites.sh deleted file mode 100755 index c775754..0000000 --- a/Scripts/check-dev-requisites.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -echo "Welcome to the CortexFlow tools" -echo "Checking pre-requisites for developers" -echo - -echo "Checking Docker installation..." -if which docker >/dev/null 2>&1; then - echo "✅ Docker is installed." -else - echo "❌ Docker is NOT installed." -fi -sleep 1 - -echo -echo "Checking Minikube installation..." -if which minikube >/dev/null 2>&1; then - echo "✅ Minikube is installed." -else - echo "❌ Minikube is NOT installed." -fi -sleep 1 - -echo - -echo "Checking Node.js installation..." -if which node >/dev/null 2>&1; then - echo "✅ Node.js is installed." -else - echo "Node.js is NOT installed." -fi -sleep 1 - -echo - -echo "Checking npm installation..." -if which npm >/dev/null 2>&1; then - echo "✅ npm is installed." -else - echo "❌ npm is NOT installed." -fi diff --git a/Scripts/install-debugging-tools.sh b/Scripts/install-debugging-tools.sh deleted file mode 100755 index 9e3ed01..0000000 --- a/Scripts/install-debugging-tools.sh +++ /dev/null @@ -1,45 +0,0 @@ -if ! 
kubectl exec -n cortexflow $1 -c $2 -- which netstat >/dev/null 2>&1; then - echo "🔨 installing netstat" - kubectl exec -n cortexflow $1 -c $2 -- apt update - kubectl exec -n cortexflow $1 -c $2 -- apt install -y net-tools -else - echo "✅ Netstat is installed." -fi - -sleep 1.5 - -if ! kubectl exec -n cortexflow $1 -c $2 -- which nc >/dev/null 2>&1; then - echo "🔨 installing netcat" - kubectl exec -n cortexflow $1 -c $2 -- apt install -y netcat-traditional -else - echo "✅ Netcat is installed." -fi - -sleep 1.5 - -if ! kubectl exec -n cortexflow $1 -c $2 -- which curl >/dev/null 2>&1; then - echo "🔨 installing curl" - kubectl exec -n cortexflow $1 -c $2 -- apt install -y curl -else - echo "✅ Curl is installed." -fi - -sleep 1.5 - -if ! kubectl exec -n cortexflow $1 -c $2 -- which nslookup >/dev/null 2>&1; then - echo "🔨 installing dnsutils" - kubectl exec -n cortexflow $1 -c $2 -- apt install -y dnsutils -else - echo "✅ Nslookup is installed." -fi - -sleep 1.5 - -if ! kubectl exec -n cortexflow $1 -c $2 -- which tcpdump >/dev/null 2>&1; then - echo "🔨 installing tcpdump" - kubectl exec -n cortexflow $1 -c $2 -- apt install -y tcpdump -else - echo "✅ tcpdump is installed." 
-fi - -sleep 1.5 diff --git a/Scripts/test-connections.sh b/Scripts/test-connections.sh deleted file mode 100755 index 95dcc94..0000000 --- a/Scripts/test-connections.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -proxy_pod_name=$(kubectl get pods -n cortexflow --no-headers -o custom-columns=":metadata.name" | grep cortexflow-proxy) -proxy_ip=$(kubectl get -o template service/proxy-service -n cortexflow --template='{{.spec.clusterIP}}') -proxy_udp_port=5053 -proxy_tcp_port=5054 -proxy_metrics_port=9090 -proxy_container=$(kubectl get pod $proxy_pod_name -n cortexflow -o jsonpath='{.spec.containers[*].name}') - -echo "🧑🏻‍🔬 Checking cortexflow proxy inside the proxy pod: $proxy_pod_name" - -sleep 1.5 -echo "🔨 checking env variables" -kubectl exec -n cortexflow $proxy_pod_name -- env - -sleep 1.5 - -./install-debugging-tools.sh $proxy_pod_name $proxy_container -echo -./test-proxy-ports.sh $proxy_pod_name $proxy_metrics_port -echo -sleep 1.5 -echo "🔨 Sending a test package with netcat from proxy pod -> proxy pod" -kubectl exec -n cortexflow $proxy_pod_name -- sh -c echo b"Hi CortexFlow" | nc -u -w5 -v 127.0.0.1 $proxy_udp_port - -echo -sleep 1.5 -echo "🔨 Testing the DNS resolution manually with nslookup" -kubectl exec -n cortexflow $proxy_pod_name -- nslookup proxy-service.cortexflow.svc.cluster.local - -sleep 1.5 -echo -./test-proxy-endpoints.sh $proxy_pod_name -echo -echo -echo "🧑🏻‍🔬 Testing outside the proxy pod using a test pod" -echo "🔨 Testing using a temporary test pod and nslookup" -kubectl run -it --rm --image=busybox test-pod --restart=Never -n cortexflow -- nslookup proxy-service.cortexflow.svc.cluster.local - -echo -sleep 1.5 -echo "🔨 Sending a test message using netcat and a temporary test pod" -kubectl run -it --rm --image=busybox test-pod --restart=Never -n cortexflow -- sh -c "echo -n Hi CortexFlow | nc -u -w 3 -v $proxy_ip $proxy_udp_port" - -echo -sleep 1.5 -echo "🔨 Testing the tcp port" -echo "🔨 Sending a test message using netcat and a temporary 
test pod " -kubectl run -it --rm --image=busybox test-pod --restart=Never -n cortexflow -- sh -c "echo -n Hi TCP | nc -w 3 -v $proxy_ip $proxy_tcp_port" diff --git a/Scripts/test-proxy-endpoints.sh b/Scripts/test-proxy-endpoints.sh deleted file mode 100755 index c89e52e..0000000 --- a/Scripts/test-proxy-endpoints.sh +++ /dev/null @@ -1,45 +0,0 @@ -echo "🔨 Testing curl command" -response=$(kubectl exec -n cortexflow $1 -- curl -s -o /dev/null -w "%{http_code}" http://localhost:9090/) -if [ "$response" -eq 200 ]; then - echo "✅ Server is working" - echo " Checking / endpoint" - kubectl exec -n cortexflow $1 -- curl -v http://localhost:9090/ -else - echo "❌ Error in http response ERROR: $response. Service does not exists or is not exposed" -fi - -echo -sleep 1.5 -echo "🔨 Testing /health endpoint" -response=$(kubectl exec -n cortexflow $1 -- curl -s -o /dev/null -w "%{http_code}" http://localhost:9090/health) -if [ "$response" -eq 200 ]; then - echo "✅ Server is working" - echo " Checking /health endpoint" - kubectl exec -n cortexflow $1 -- curl -v http://localhost:9090/health -else - echo "❌ Error in http response ERROR: $response. Service does not exists or is not exposed" -fi - -echo -sleep 1.5 -echo "🔨 Testing /metrics endpoint" -response=$(kubectl exec -n cortexflow $1 -- curl -s -o /dev/null -w "%{http_code}" http://localhost:9090/metrics) -if [ "$response" -eq 200 ]; then - echo "✅ Server is working" - echo " Checking /metrics endpoint" - kubectl exec -n cortexflow $1 -- curl -v http://localhost:9090/metrics -else - echo "❌ Error in http response ERROR: $response. 
Service does not exists or is not exposed" -fi - -echo -sleep 1.5 -echo "🔨 Testing /status endpoint" -response=$(kubectl exec -n cortexflow $1 -- curl -s -o /dev/null -w "%{http_code}" http://localhost:9090/status) -if [ "$response" -eq 200 ]; then - echo "✅ Server is working" - echo " Checking /status endpoint" - kubectl exec -n cortexflow $1 -- curl -v http://localhost:9090/status -else - echo "❌ Error in http response ERROR: $response. Service does not exists or is not exposed" -fi diff --git a/Scripts/test-proxy-ports.sh b/Scripts/test-proxy-ports.sh deleted file mode 100755 index 33d658d..0000000 --- a/Scripts/test-proxy-ports.sh +++ /dev/null @@ -1,18 +0,0 @@ -echo "🔨 Testing network connections" -kubectl exec -n cortexflow $1 -- netstat -tulnp | grep $2 - -sleep 1.5 - -echo -echo "🔨 testing if the process is in execution" -kubectl exec -n cortexflow $1 -- ps aux | grep cortexflow-proxy - -sleep 1.5 -echo -echo "🔨 testing using netcat" -kubectl exec -n cortexflow $1 -- nc -zv proxy-service.cortexflow.svc.cluster.local $2 - -sleep 1.5 -echo -echo "🔨 Checking if the proxy is listening in the 5053 port" -kubectl exec -n cortexflow $1 -- netstat -ulnp diff --git a/Scripts/test-sidecar-advanced-tcp.sh b/Scripts/test-sidecar-advanced-tcp.sh deleted file mode 100755 index ec3fce4..0000000 --- a/Scripts/test-sidecar-advanced-tcp.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/sh - -./install-debugging-tools.sh test-proxy proxy-sidecar -./install-debugging-tools.sh test-proxy2 proxy-sidecar -./install-debugging-tools.sh test-proxy3 proxy-sidecar -./install-debugging-tools.sh test-proxy4 proxy-sidecar - -# start the tcp listener -kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Starting TCP listener on port 5054..." - nohup sh -c "nc -l -p 5054" >/dev/null 2>&1 & -' - -kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Starting TCP listener on port 5054..." 
- nohup sh -c "nc -l -p 5054" >/dev/null 2>&1 & -' - - -test_proxy_to_proxy2() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -w1 test-proxy2 5054 - ' - done -} - -test_proxy2_to_proxy() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -w1 test-proxy 5054 - ' - done -} - -test_proxy3_to_proxy2() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - kubectl exec test-proxy3 -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -w1 test-proxy2 5054 - ' - done -} - -test_proxy4_to_proxy2() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - kubectl exec test-proxy4 -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -w1 test-proxy2 5054 - ' - done -} - -# execute the functions in background -test_proxy_to_proxy2 & -test_proxy2_to_proxy & -test_proxy3_to_proxy2 & -test_proxy4_to_proxy2 & - - -sleep 300 - -# stop the listeners -kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c 'pkill nc' -kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c 'pkill nc' diff --git a/Scripts/test-sidecar-advanced-udp.sh b/Scripts/test-sidecar-advanced-udp.sh deleted file mode 100755 index d9c52a8..0000000 --- a/Scripts/test-sidecar-advanced-udp.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/sh 
-./install-debugging-tools.sh test-proxy proxy-sidecar -./install-debugging-tools.sh test-proxy2 proxy-sidecar -./install-debugging-tools.sh test-proxy3 proxy-sidecar -./install-debugging-tools.sh test-proxy4 proxy-sidecar - -# start the udp listener -kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Starting UDP listener on port 5053..." - nohup nc -lu 5053 >/dev/null 2>&1 & -' - -kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Starting UDP listener on port 5053..." - nohup nc -lu 5053 >/dev/null 2>&1 & -' - - -test_proxy_to_proxy2() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - echo "Sending UDP packet from test-proxy to test-proxy2..." - kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -u -w1 test-proxy2 5053 - ' - done -} - -test_proxy2_to_proxy() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - echo "Sending UDP packet from test-proxy2 to test-proxy..." - kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -u -w1 test-proxy 5053 - ' - done -} - -test_proxy3_to_proxy2() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - echo "Sending UDP packet from test-proxy3 to test-proxy2..." - kubectl exec test-proxy3 -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -u -w1 test-proxy2 5053 - ' - done -} - -test_proxy4_to_proxy2() { - for i in $(seq 1 300); do - sleep $((RANDOM % 5 + 1)) - echo "Sending UDP packet from test-proxy4 to test-proxy2..." 
- kubectl exec test-proxy4 -c proxy-sidecar -n cortexflow -- sh -c ' - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -u -w1 test-proxy2 5053 - ' - done -} - -# execute the functions in background -(test_proxy_to_proxy2 &) & -(test_proxy2_to_proxy &) & -(test_proxy3_to_proxy2 &) & -(test_proxy4_to_proxy2 &) & - - -sleep 300 - -# stop the listeners -kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c 'pkill nc || kill $(pgrep nc)' -kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c 'pkill nc || kill $(pgrep nc)' diff --git a/Scripts/test-sidecar-proxy.sh b/Scripts/test-sidecar-proxy.sh deleted file mode 100755 index fcce42d..0000000 --- a/Scripts/test-sidecar-proxy.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -echo "Testing Sidecar proxy injection " - -sleep 1 -echo "Checking pods" -kubectl get pods -o wide -n cortexflow -echo -echo "Checking if the sidecar proxy is present" -kubectl get pods -n cortexflow -o json | jq '.items[].spec.containers[].name' - -echo -sleep 1 -echo "Checking open ports in test-proxy" -kubectl get pods test-proxy -o jsonpath='{.spec.containers[*].ports}' -n cortexflow -echo -kubectl get pods test-proxy2 -o jsonpath='{.spec.containers[*].ports}' -n cortexflow - -echo -echo -echo "Installing debugging tools in test-proxy: (PROXY-SIDECAR container)" -sleep 3 -./install-debugging-tools.sh test-proxy proxy-sidecar -echo -echo -echo "Installing debugging tools in test-proxy2: (PROXY-SIDECAR container)" -sleep 3 -./install-debugging-tools.sh test-proxy2 proxy-sidecar - -echo -echo -echo "Checking network connections in test-proxy pod " -kubectl exec -it test-proxy -c proxy-sidecar -n cortexflow -- netstat -tulnp -echo -echo "Checking network connections in test-proxy2 pod" -kubectl exec -it test-proxy2 -c proxy-sidecar -n cortexflow -- netstat -tulnp - - -echo -sleep 2 -echo "TEST 1: Checking if test-proxy can 
communicate with test-proxy2" -kubectl exec -it test-proxy -c proxy-sidecar -n cortexflow -- nc -zv test-proxy2.cortexflow.svc.cluster.local 5054 -echo - -echo - -echo "TEST 2: Checking if test-proxy can communicate with test-proxy2 (TCP)" - -# 2. Send the message from test-proxy to test-proxy2 -kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Test: Incoming Message ⏳" - printf "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJwYXlsb2FkIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}\n" | nc -w3 test-proxy2 5054 && echo "✅ Test completed" -' - -echo -sleep 2 -echo -echo "TEST 2: Sending a message from test-proxy to test-proxy2 (UDP)" - -#Start the UDP listener on test-proxy2 (MUST be before sending the message) -kubectl exec test-proxy2 -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Starting UDP listener on port 5053..." - nohup sh -c "nc -lu -p 5053 > /tmp/received_message.log" >/dev/null 2>&1 & - sleep 2 # Wait for the listener to start -' - -#2. Send the message from test-proxy to test-proxy2 -kubectl exec test-proxy -c proxy-sidecar -n cortexflow -- sh -c ' - echo "Test: Incoming Message ⏳" - echo "{\"service\":\"test-proxy2.cortexflow\",\"direction\":\"Incoming\",\"payload\":\"eyJtZXNzYWdlIjogIkhlbGxvIGZyb20gcHJveHktc2lkZWNhciJ9\"}" | nc -u -w3 test-proxy2 5053 && echo "✅ Test completed" -' From aa9f4383dbc925763debfb7882082ccbaa15c945 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 1 Jun 2026 19:11:21 +0200 Subject: [PATCH 13/17] [#175]: added modules to initialize the Metrics exporter using opentelemetry sdk. added srtruct Metrics to group all the metrics in one place. added auxiliary functions record_network_metrics and record_timestamp_metrics. 
added exporter setting in buffer_type/read_network_metrics and buffer_type/read_timestamp_metrics --- core/common/src/buffer_type.rs | 83 +++++++++++- core/common/src/lib.rs | 4 +- core/common/src/otel_metrics.rs | 133 +++++++++++++++++++ core/src/components/metrics/src/otel_init.rs | 120 +++++++++++++++++ 4 files changed, 333 insertions(+), 7 deletions(-) create mode 100644 core/common/src/otel_metrics.rs create mode 100644 core/src/components/metrics/src/otel_init.rs diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index f962698..45d82c8 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -1,9 +1,14 @@ +#[cfg(feature = "monitoring-structs")] +use crate::otel_metrics::Metrics; #[cfg(feature = "buffer-reader")] use aya::maps::{MapData, PerfEventArray}; use aya::{maps::perf::PerfEventArrayBuffer, util::online_cpus}; use bytemuck_derive::Zeroable; use bytes::BytesMut; use std::net::Ipv4Addr; +#[cfg(feature = "buffer-reader")] +#[cfg(feature = "monitoring-structs")] +use std::sync::Arc; use tracing::{error, info, warn}; // @@ -342,7 +347,39 @@ impl BufferType { } } #[cfg(feature = "monitoring-structs")] - pub async fn read_network_metrics(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { + /// Continuously read [`NetworkMetrics`] events and record OpenTelemetry + /// observations. + /// + /// This helper mirrors the core behaviour of + /// [`cortexbrain_common::buffer_type::read_perf_buffer`] but adds the OTel + /// instrumentation layer. + /// + /// # Loop + /// + /// 1. For every CPU buffer call `read_events`. + /// 2. Parse each raw [`BytesMut`] into [`NetworkMetrics`] using an + /// unaligned read (the struct is `#[repr(C, packed)]` and `Pod`). + /// 3. Call [`Metrics::record_network_metrics`]. + /// 4. Retain the legacy `tracing::info!` log for human-readable local output. + /// 5. Sleep 100 ms between polls. 
+ /// + /// # Safety + /// + /// `std::ptr::read_unaligned` is safe here because the eBPF program writes + /// exactly the `NetworkMetrics` layout into the ring buffer and the struct + /// implements [`aya::Pod`]. + /// Continuously read [`TimeStampMetrics`] events and record OpenTelemetry + /// observations. + /// + /// Counterpart to [`read_network_buffer`] for the `time_stamp_events` map. + + pub async fn read_network_metrics( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + exporter: &str, + metrics: Arc, + ) { for i in offset..tot_events { let vec_bytes = &buffers[i as usize]; if vec_bytes.len() < std::mem::size_of::() { @@ -361,6 +398,11 @@ impl BufferType { if vec_bytes.len() >= std::mem::size_of::() { let net_metrics: NetworkMetrics = unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + match exporter { + "otlp" => metrics.record_network_metrics(&net_metrics), + _ => continue, // skip + } let tgid = net_metrics.tgid; let comm = String::from_utf8_lossy(&net_metrics.comm); let ts_us = net_metrics.ts_us; @@ -389,7 +431,13 @@ impl BufferType { } } #[cfg(feature = "monitoring-structs")] - pub async fn read_timestamp_metrics(buffers: &mut [BytesMut], tot_events: i32, offset: i32) { + pub async fn read_timestamp_metrics( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + exporter: &str, + metrics: Arc, + ) { for i in offset..tot_events { let vec_bytes = &buffers[i as usize]; if vec_bytes.len() < std::mem::size_of::() { @@ -408,6 +456,12 @@ impl BufferType { if vec_bytes.len() >= std::mem::size_of::() { let time_stamp_event: TimeStampMetrics = unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + match exporter { + "otlp" => metrics.record_timestamp_metrics(&time_stamp_event), + _ => continue, + } + let delta_us = time_stamp_event.delta_us; let ts_us = time_stamp_event.ts_us; let tgid = time_stamp_event.tgid; @@ -431,6 +485,7 @@ pub async fn read_perf_buffer>( mut array_buffers: Vec>, mut buffers: 
Vec, buffer_type: BufferType, + #[cfg(feature = "monitoring-structs")] metrics: Option>, ) { // loop over the buffers loop { @@ -469,13 +524,29 @@ pub async fn read_perf_buffer>( } #[cfg(feature = "monitoring-structs")] BufferType::NetworkMetrics => { - BufferType::read_network_metrics(&mut buffers, tot_events, offset) - .await + BufferType::read_network_metrics( + &mut buffers, + tot_events, + offset, + "otlp", + metrics + .clone() + .expect("Metrics required for NetworkMetrics"), + ) + .await } #[cfg(feature = "monitoring-structs")] BufferType::TimeStampMetrics => { - BufferType::read_timestamp_metrics(&mut buffers, tot_events, offset) - .await + BufferType::read_timestamp_metrics( + &mut buffers, + tot_events, + offset, + "otlp", + metrics + .clone() + .expect("Metric required for TimeStampMetrics"), + ) + .await } } } diff --git a/core/common/src/lib.rs b/core/common/src/lib.rs index d7e48b0..15c4ad7 100644 --- a/core/common/src/lib.rs +++ b/core/common/src/lib.rs @@ -1,7 +1,7 @@ #[cfg(any( feature = "buffer-reader", feature = "network-structs", - feature = "monitoring-structs" + feature = "monitoring-structs", ))] pub mod buffer_type; pub mod constants; @@ -9,5 +9,7 @@ pub mod formatters; pub mod logger; #[cfg(feature = "map-handlers")] pub mod map_handlers; +#[cfg(feature = "monitoring-structs")] +pub mod otel_metrics; #[cfg(feature = "program-handlers")] pub mod program_handlers; diff --git a/core/common/src/otel_metrics.rs b/core/common/src/otel_metrics.rs new file mode 100644 index 0000000..ae8c9db --- /dev/null +++ b/core/common/src/otel_metrics.rs @@ -0,0 +1,133 @@ +//! OpenTelemetry metric instruments for eBPF perf-buffer events. +//! +//! This module centralises every [`Meter`]-backed instrument that the +//! `metrics` crate uses to observe raw eBPF events. It provides a single +//! [`Metrics`] handle that is cheap to [`Arc`]-clone and safe to use from +//! multiple asynchronous tasks concurrently. +//! +//! - An [`Arc`] is moved into each Tokio +//! 
task that reads a perf buffer. All instrument operations are lock-free. +//! - Every observation is tagged with `tgid` and `comm` +//! extracted from the eBPF struct, allowing downstream collectors to group +//! telemetry by process. + +use crate::buffer_type::{NetworkMetrics, TimeStampMetrics}; +use opentelemetry::KeyValue; +use opentelemetry::metrics::{Counter, Gauge, Histogram, Meter}; +pub struct Metrics { + /// Total number of eBPF events processed across all perf buffers. + pub events_total: Counter, + + /// Total number of network-related events produced by the `net_metrics` + /// eBPF map. + pub packets_total: Counter, + + /// Observed socket drop count (`sk_drops`) from the kernel sock struct. + pub sk_drops: Gauge, + + /// Observed socket error count (`sk_err`) from the kernel sock struct. + pub sk_err: Gauge, + + /// Histogram of `delta_us` values supplied by the `time_stamp_events` + /// perf buffer. + pub delta_us: Histogram, + + /// Histogram of `ts_us` values seen in both `net_metrics` and + /// `time_stamp_events`. + pub ts_us: Histogram, +} + +impl Metrics { + /// Initialise all instruments backed by the supplied [`Meter`]. 
+ pub fn new(meter: &Meter) -> Self { + // total events + let events_total = meter + .u64_counter("cortexbrain_events_total") + .with_description("Total number of eBPF events processed") + .build(); + + // total packets + let packets_total = meter + .u64_counter("cortexbrain_packets_total") + .with_description("Total number of network events processed") + .build(); + + // socket drops + let sk_drops = meter + .i64_gauge("cortexbrain_sk_drops") + .with_description("Socket drop count per event") + .build(); + + // socket errors + let sk_err = meter + .i64_gauge("cortexbrain_sk_err") + .with_description("Socket error count per event") + .build(); + + // delta microseconds + let delta_us = meter + .u64_histogram("cortexbrain_delta_us") + .with_description("Distribution of delta_us values from timestamp events") + .build(); + + // timestamp microseconds grouped + let ts_us = meter + .u64_histogram("cortexbrain_ts_us") + .with_description("Distribution of timestamp values from eBPF events") + .build(); + + Self { + events_total, + packets_total, + sk_drops, + sk_err, + delta_us, + ts_us, + } + } + + /// Record a single [`NetworkMetrics`] event. + /// + /// Increments `events_total` and `packets_total`, records `sk_drops` and + /// `sk_err` as gauges, and observes `ts_us` in the timestamp histogram. + /// + /// Every observation carries: + /// + /// - `tgid` – task group ID. + /// - `comm` – command name (null-terminated bytes converted to a UTF-8 + /// string and trimmed).
+ pub fn record_network_metrics(&self, m: &NetworkMetrics) { + let comm = String::from_utf8_lossy(&m.comm); + let comm_trimmed = comm.trim_end_matches('\0').to_string(); + let attrs = &[ + KeyValue::new("tgid", m.tgid as i64), + KeyValue::new("comm", comm_trimmed), + ]; + + self.events_total.add(1, attrs); + self.packets_total.add(1, attrs); + self.sk_drops.record(m.sk_drops as i64, attrs); + self.sk_err.record(m.sk_err as i64, attrs); + self.ts_us.record(m.ts_us, attrs); + } + + /// Record a single [`TimeStampMetrics`] event. + /// + /// Increments `events_total`, and records `delta_us` and `ts_us` in their + /// respective histograms. + /// + /// Every observation carries `tgid` and `comm` (see + /// [`record_network_metrics`]). + pub fn record_timestamp_metrics(&self, m: &TimeStampMetrics) { + let comm = String::from_utf8_lossy(&m.comm); + let comm_trimmed = comm.trim_end_matches('\0').to_string(); + let attrs = &[ + KeyValue::new("tgid", m.tgid as i64), + KeyValue::new("comm", comm_trimmed), + ]; + + self.events_total.add(1, attrs); + self.delta_us.record(m.delta_us, attrs); + self.ts_us.record(m.ts_us, attrs); + } +} diff --git a/core/src/components/metrics/src/otel_init.rs b/core/src/components/metrics/src/otel_init.rs new file mode 100644 index 0000000..e472c7e --- /dev/null +++ b/core/src/components/metrics/src/otel_init.rs @@ -0,0 +1,120 @@ +//! docs +//! This module configures and bootstraps the OpenTelemetry SDK (OTel SDK) +//! within the `metrics` binary. Its goal is to expose a [`Meter`] --- the +//! primary entry-point for creating counters, gauges and histograms --- +//! backed by an **OTLP/gRPC** metric exporter. +//! +//! # Relationship to the rest of the crate +//! +//! `otel_init::init_opentelemetry()` is invoked **once** in [`main`], before +//! any eBPF program is loaded. The returned [`Meter`] is then passed through +//! the call chain into [`event_listener`](crate::helpers::event_listener) +//! 
where it is used by the async tasks that read eBPF perf-buffers. See +//! [`crate::helpers`] for the consumption side. +//! +//! When the application exits (either because `Ctrl-C` was received or because +//! an error bubbled up), [`shutdown_opentelemetry`] is called. This flushes +//! every remaining aggregated metric to the OTLP collector before the process +//! terminates. +//! + +use opentelemetry::global; +use opentelemetry::metrics::{Meter, MeterProvider}; +use opentelemetry_otlp::{MetricExporter, WithExportConfig}; +use opentelemetry_sdk::metrics::{PeriodicReader, SdkMeterProvider}; +use std::env; +use std::sync::OnceLock; +use std::time::Duration; + +/// Environment variable that holds the OTLP collector endpoint. +/// +/// Expected format: `"http://collector:4317"` (gRPC transport). +/// +pub const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; + +/// Default OTLP endpoint used when [`OTEL_EXPORTER_OTLP_ENDPOINT`] is not +/// present in the environment. +/// +/// Points to a locally-running OpenTelemetry Collector on the standard +/// **gRPC** port `4317`. Note that OTLP over HTTP typically uses `4318` --- +/// make sure your Collector is actually listening for **gRPC** traffic on the +/// port you configure. +pub const DEFAULT_OTLP_ENDPOINT: &str = "http://localhost:4317"; + +/// Singleton that owns the concrete `SdkMeterProvider` instance. +/// OnceLock guarantees single initialisation, we avoid accidentally creating two providers (and +/// two background export tasks) if `init_opentelemetry()` were ever called +/// twice. +/// +/// # Thread safety +/// +/// `OnceLock` is `Sync`, so the static can be read safely from any thread +/// or Tokio task once populated. +static METER_PROVIDER: OnceLock = OnceLock::new(); +/// docs: +/// Initialise the OpenTelemetry SDK, wire up the OTLP/gRPC exporter, and +/// return a [`Meter`] ready for instrumenting the `metrics` crate. +/// +/// 1. 
Read the endpoint from [`OTEL_EXPORTER_OTLP_ENDPOINT`] with the +/// hard-coded default [`DEFAULT_OTLP_ENDPOINT`]. +/// 2. Build a `MetricExporter` using the Tonic / gRPC transport: +/// - `with_tonic()` enables the Tonic-based gRPC client. +/// - `with_endpoint()` sets the target Collector URL. +/// - `with_timeout(Duration::from_secs(10))` caps each export RPC to 10 +/// seconds; if the Collector is unreachable the RPC aborts instead of +/// hanging indefinitely. +/// 3. Wrap the exporter in a `PeriodicReader`. The reader collects +/// aggregated metrics from every instrument every 5 seconds and hands +/// them to the exporter. This is the "push" model --- metrics leave the +/// process automatically without an external scraper. +/// 4. Construct an `SdkMeterProvider` and register it as the global +/// meter provider (`global::set_meter_provider`). The global handle is +/// needed for instrumenting code spawned in other Tokio tasks (see +/// [`helpers::event_listener`](crate::helpers::event_listener)). +/// 5. Keep a clone of the concrete provider in `METER_PROVIDER` so that +/// [`shutdown_opentelemetry`] can later call `SdkMeterProvider::shutdown()`. +/// 6. Create a `Meter` named `"cortexbrain-metrics"` and return it. +/// +/// Potential causes of errors: +/// +/// * An invalid endpoint URL (malformed string). +/// * Network-level failure during exporter construction. +/// * The provider already having been initialised. +/// +pub fn init_opentelemetry() -> Result { + let endpoint = + env::var(OTEL_EXPORTER_OTLP_ENDPOINT).unwrap_or_else(|_| DEFAULT_OTLP_ENDPOINT.to_string()); + + let exporter = MetricExporter::builder() + .with_tonic() + .with_endpoint(endpoint) + .with_timeout(Duration::from_secs(10)) + .build()?; + + let reader = PeriodicReader::builder(exporter) + .with_interval(Duration::from_secs(5)) + .build(); + + let provider = SdkMeterProvider::builder().with_reader(reader).build(); + + // Make the provider globally discoverable.
This clone is cheap because + // SdkMeterProvider is an Arc-backed handle. + global::set_meter_provider(provider.clone()); + + // Stash the concrete handle so shutdown_opentelemetry can flush. + METER_PROVIDER + .set(provider.clone()) + .map_err(|_| anyhow::anyhow!("OpenTelemetry meter provider already initialised"))?; + + let meter = provider.meter("cortexbrain-metrics"); + Ok(meter) +} +/// docs: +/// Flush every buffered metric to the OTLP collector and shut down the SDK. +pub fn shutdown_opentelemetry() { + if let Some(provider) = METER_PROVIDER.get() + && let Err(e) = provider.shutdown() + { + tracing::error!("Failed to shut down OpenTelemetry meter provider: {:?}", e); + } +} From 149abf6dd1fcd88f99dc9a56e6d3ce1a441b03da Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 1 Jun 2026 19:11:55 +0200 Subject: [PATCH 14/17] (chore): updated dependencies --- cli/Cargo.lock | 91 ++++++++++++------------ core/Cargo.lock | 98 ++++++++++++++------------ core/common/Cargo.toml | 11 +-- core/src/components/metrics/Cargo.toml | 3 + 4 files changed, 106 insertions(+), 97 deletions(-) diff --git a/cli/Cargo.lock b/cli/Cargo.lock index 0fea51d..7c843ec 100644 --- a/cli/Cargo.lock +++ b/cli/Cargo.lock @@ -361,8 +361,10 @@ dependencies = [ "opentelemetry", "opentelemetry-appender-tracing", "opentelemetry-otlp", + "opentelemetry-semantic-conventions", "opentelemetry-stdout", "opentelemetry_sdk", + "tokio", "tracing", "tracing-subscriber", ] @@ -992,16 +994,6 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" -[[package]] -name = "iri-string" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" -dependencies = [ - "memchr", - "serde", -] - [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -1266,9 +1258,9 @@ checksum = 
"d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "opentelemetry" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +checksum = "b0142c63252a9e054e68a4c61a5778f7b14f576274d593f8ce883d191a099682" dependencies = [ "futures-core", "futures-sink", @@ -1280,9 +1272,9 @@ dependencies = [ [[package]] name = "opentelemetry-appender-tracing" -version = "0.31.1" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2" +checksum = "2c0080f0dc1d7c786f467cd85a4e395fcab11ee852004f39a29a18ab7c25d837" dependencies = [ "opentelemetry", "tracing", @@ -1292,9 +1284,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" +checksum = "5683015d09e2df236ef005b17f6f196f0d5f6313c4fa43a7b6a53b52776e4331" dependencies = [ "async-trait", "bytes", @@ -1305,9 +1297,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf" +checksum = "9966929966d17620d7c316c643ba62631826e10021409357772d5eea84f62c35" dependencies = [ "http", "opentelemetry", @@ -1319,14 +1311,14 @@ dependencies = [ "thiserror 2.0.16", "tokio", "tonic", - "tracing", + "tonic-types", ] [[package]] name = "opentelemetry-proto" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" +checksum = "56d658ba1faf63f7b9c492cfbe6e0ec365440a16132d3270c1065f7b33f1b638" 
dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -1335,11 +1327,17 @@ dependencies = [ "tonic-prost", ] +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ca2f98a0437b427b4b08f19f1caa3c44db885a202bc12cfea13d6c702243d68" + [[package]] name = "opentelemetry-stdout" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc8887887e169414f637b18751487cce4e095be787d23fad13c454e2fb1b3811" +checksum = "a1b1c6a247d79091f0062a5f4bd058589525cf987a8d4c169440d9c1be72f0ad" dependencies = [ "chrono", "opentelemetry", @@ -1348,15 +1346,16 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.31.0" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd" +checksum = "9b59f80e1ac4d5ff7a2db8fb6c80badb7f0f3f858211fba08dd9aaec750894f9" dependencies = [ "futures-channel", "futures-executor", "futures-util", "opentelemetry", "percent-encoding", + "portable-atomic", "rand", "thiserror 2.0.16", "tokio", @@ -1502,6 +1501,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "potential_utf" version = "0.1.4" @@ -1708,9 +1713,9 @@ checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" [[package]] name = "reqwest" -version = "0.12.24" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" +checksum = 
"219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3" dependencies = [ "base64", "bytes", @@ -1726,9 +1731,6 @@ dependencies = [ "log", "percent-encoding", "pin-project-lite", - "serde", - "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", "tower", @@ -1926,18 +1928,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -2273,6 +2263,17 @@ dependencies = [ "tonic-prost", ] +[[package]] +name = "tonic-types" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a875a902255423d34c1f20838ab374126db8eb41625b7947a1d54113b0b7399" +dependencies = [ + "prost", + "prost-types", + "tonic", +] + [[package]] name = "tower" version = "0.5.2" @@ -2294,9 +2295,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.6" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "base64", "bitflags", @@ -2304,13 +2305,13 @@ dependencies = [ "futures-util", "http", "http-body", - "iri-string", "mime", "pin-project-lite", "tower", "tower-layer", "tower-service", "tracing", + "url", ] [[package]] diff --git a/core/Cargo.lock b/core/Cargo.lock index 745a66d..6ae4f98 100644 --- a/core/Cargo.lock +++ b/core/Cargo.lock @@ -415,8 +415,10 @@ dependencies = [ "opentelemetry", "opentelemetry-appender-tracing", "opentelemetry-otlp", + "opentelemetry-semantic-conventions", "opentelemetry-stdout", "opentelemetry_sdk", + "tokio", "tracing", "tracing-subscriber", ] @@ -1034,16 +1036,6 @@ version = 
"2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" -[[package]] -name = "iri-string" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" -dependencies = [ - "memchr", - "serde", -] - [[package]] name = "itertools" version = "0.14.0" @@ -1239,6 +1231,9 @@ dependencies = [ "cortexbrain-common", "libc", "nix", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry_sdk", "tokio", "tracing", "tracing-subscriber", @@ -1355,9 +1350,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "opentelemetry" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +checksum = "b0142c63252a9e054e68a4c61a5778f7b14f576274d593f8ce883d191a099682" dependencies = [ "futures-core", "futures-sink", @@ -1369,9 +1364,9 @@ dependencies = [ [[package]] name = "opentelemetry-appender-tracing" -version = "0.31.1" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2" +checksum = "2c0080f0dc1d7c786f467cd85a4e395fcab11ee852004f39a29a18ab7c25d837" dependencies = [ "opentelemetry", "tracing", @@ -1381,9 +1376,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" +checksum = "5683015d09e2df236ef005b17f6f196f0d5f6313c4fa43a7b6a53b52776e4331" dependencies = [ "async-trait", "bytes", @@ -1394,9 +1389,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.31.0" +version = "0.32.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf" +checksum = "9966929966d17620d7c316c643ba62631826e10021409357772d5eea84f62c35" dependencies = [ "http", "opentelemetry", @@ -1408,14 +1403,14 @@ dependencies = [ "thiserror 2.0.17", "tokio", "tonic", - "tracing", + "tonic-types", ] [[package]] name = "opentelemetry-proto" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" +checksum = "56d658ba1faf63f7b9c492cfbe6e0ec365440a16132d3270c1065f7b33f1b638" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -1424,11 +1419,17 @@ dependencies = [ "tonic-prost", ] +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ca2f98a0437b427b4b08f19f1caa3c44db885a202bc12cfea13d6c702243d68" + [[package]] name = "opentelemetry-stdout" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc8887887e169414f637b18751487cce4e095be787d23fad13c454e2fb1b3811" +checksum = "a1b1c6a247d79091f0062a5f4bd058589525cf987a8d4c169440d9c1be72f0ad" dependencies = [ "chrono", "opentelemetry", @@ -1437,15 +1438,16 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.31.0" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd" +checksum = "9b59f80e1ac4d5ff7a2db8fb6c80badb7f0f3f858211fba08dd9aaec750894f9" dependencies = [ "futures-channel", "futures-executor", "futures-util", "opentelemetry", "percent-encoding", + "portable-atomic", "rand", "thiserror 2.0.17", "tokio", @@ -1585,6 +1587,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "potential_utf" version = "0.1.4" @@ -1792,9 +1800,9 @@ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "reqwest" -version = "0.12.24" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" +checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3" dependencies = [ "base64", "bytes", @@ -1810,9 +1818,6 @@ dependencies = [ "log", "percent-encoding", "pin-project-lite", - "serde", - "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", "tower", @@ -2020,18 +2025,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -2219,9 +2212,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.48.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ "bytes", "libc", @@ -2361,6 +2354,17 @@ dependencies = [ "tonic-prost", ] +[[package]] +name = "tonic-types" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a875a902255423d34c1f20838ab374126db8eb41625b7947a1d54113b0b7399" +dependencies = [ + "prost", + "prost-types", + "tonic", +] 
+ [[package]] name = "tower" version = "0.5.2" @@ -2382,9 +2386,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.6" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "base64", "bitflags", @@ -2392,13 +2396,13 @@ dependencies = [ "futures-util", "http", "http-body", - "iri-string", "mime", "pin-project-lite", "tower", "tower-layer", "tower-service", "tracing", + "url", ] [[package]] diff --git a/core/common/Cargo.toml b/core/common/Cargo.toml index ee50e2b..e1c39c5 100644 --- a/core/common/Cargo.toml +++ b/core/common/Cargo.toml @@ -16,15 +16,16 @@ anyhow = "1.0" kube = { version = "2.0.1", features = ["client"] } k8s-openapi = { version = "0.26.0", features = ["v1_34"] } aya = "0.13.1" -opentelemetry = "0.31.0" -opentelemetry_sdk = { version = "0.31.0", features = ["logs", "rt-tokio"] } -opentelemetry-stdout = { version = "0.31.0", features = ["logs"] } -opentelemetry-appender-tracing = "0.31.1" -opentelemetry-otlp = { version = "0.31.0", features = ["logs", "grpc-tonic"] } +opentelemetry = "0.32.0" +opentelemetry_sdk = { version = "0.32.0", features = ["logs", "rt-tokio"] } +opentelemetry-stdout = { version = "0.32.0", features = ["logs"] } +opentelemetry-appender-tracing = "0.32.0" +opentelemetry-otlp = { version = "0.32.0", features = ["logs", "grpc-tonic"] } bytemuck = "1.25.0" bytes = "1.11.0" bytemuck_derive = "1.10.2" tokio = "1.49.0" +opentelemetry-semantic-conventions = "0.32.0" [features] map-handlers = [] diff --git a/core/src/components/metrics/Cargo.toml b/core/src/components/metrics/Cargo.toml index c8dcb5b..1c7d420 100644 --- a/core/src/components/metrics/Cargo.toml +++ b/core/src/components/metrics/Cargo.toml @@ -28,3 +28,6 @@ cortexbrain-common = { path = "../../../common/", features = [ "network-structs" ] } nix = { 
version = "0.30.1", features = ["net"] } +opentelemetry = "0.32.0" +opentelemetry_sdk = "0.32.0" +opentelemetry-otlp = { version = "0.32.0", features = ["grpc-tonic"] } From 7ee399e84a1ae1707274c59a01fec1082cda1f8e Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 1 Jun 2026 19:16:18 +0200 Subject: [PATCH 15/17] [#175]: implemented the opentelemetry metrics export using the function in the common crate --- core/src/components/metrics/src/helpers.rs | 94 +++++++++++++++------- core/src/components/metrics/src/main.rs | 58 ++++++++----- core/src/components/metrics/src/mod.rs | 5 +- 3 files changed, 104 insertions(+), 53 deletions(-) diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index 843f45d..804e930 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -1,14 +1,34 @@ use anyhow::anyhow; use aya::util::online_cpus; use cortexbrain_common::map_handlers::map_manager; -use cortexbrain_common::{ - buffer_type::{BufferSize, BufferType, read_perf_buffer}, - map_handlers::BpfMapsData, -}; +use cortexbrain_common::{buffer_type::BufferSize, map_handlers::BpfMapsData}; +use opentelemetry::metrics::Meter; +use std::sync::Arc; use tokio::signal; use tracing::{error, info}; -pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> { +use cortexbrain_common::buffer_type::{BufferType, read_perf_buffer}; +use cortexbrain_common::otel_metrics::Metrics; + +/// Listen for eBPF perf-buffer events and record OpenTelemetry metrics. +/// +/// This function bridges the eBPF perf-buffer layer with the OpenTelemetry +/// metrics pipeline. It opens per-CPU buffers for the two maps of interest +/// (`net_metrics` and `time_stamp_events`), spawns asynchronous consumers, +/// and parks until a `Ctrl-C` signal is received or one of the consumers +/// terminates. 
+/// +/// # Arguments +/// +/// -`bpf_maps` – handles for the pinned BPF maps produced by +/// [`cortexbrain_common::map_handlers::map_pinner`]. +/// - `meter` – an initialised OpenTelemetry [`Meter`]. +/// +/// # Errors +/// +/// Returns `Err` if the map manager or CPU enumeration fails. +/// +pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), anyhow::Error> { info!("Getting CPU count..."); let mut maps = map_manager(bpf_maps)?; @@ -35,48 +55,63 @@ pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> info!("Perf buffers created successfully"); - let (time_stamp_events_array, time_stamp_events_perf_buffer) = maps + let (_time_stamp_events_array, time_stamp_events_perf_buffer) = maps .remove("time_stamp_events") .expect("Cannot create time_stamp_events_buffer"); - let (net_perf_array, net_perf_buffer) = maps + let (_net_perf_array, net_perf_buffer) = maps .remove("net_metrics") .expect("Cannot create net_perf_buffer"); - // Create proper sized buffers + // Allocate byte-buffers sized for each structure type let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); + let metrics = Arc::new(Metrics::new(&meter)); + info!("Starting event listener tasks..."); - let metrics_map_displayer = tokio::spawn(async move { - read_perf_buffer( - net_perf_buffer, - net_metrics_buffers, - BufferType::NetworkMetrics, - ) - .await; - }); - - let time_stamp_events_displayer = tokio::spawn(async move { - read_perf_buffer( - time_stamp_events_perf_buffer, - time_stamp_events_buffers, - BufferType::TimeStampMetrics, - ) - .await; - }); + + let net_metrics_handle = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = net_perf_buffer; + let mut buffers = net_metrics_buffers; + tokio::spawn(async move { + read_perf_buffer( + array_buffers, + buffers, + BufferType::NetworkMetrics, + Some(metrics), + ) + .await; + }) + }; + + let 
time_stamp_handle = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = time_stamp_events_perf_buffer; + let mut buffers = time_stamp_events_buffers; + tokio::spawn(async move { + read_perf_buffer( + array_buffers, + buffers, + BufferType::TimeStampMetrics, + Some(metrics), + ) + .await; + }) + }; info!("Event listeners started, entering main loop..."); tokio::select! { - result = metrics_map_displayer => { + result = net_metrics_handle => { if let Err(e) = result { - error!("Metrics map displayer task failed: {:?}", e); + error!("Network metrics task failed: {:?}", e); } } - result = time_stamp_events_displayer => { + result = time_stamp_handle => { if let Err(e) = result { - error!("Time stamp events displayer task failed: {:?}", e); + error!("Timestamp events task failed: {:?}", e); } } @@ -85,6 +120,5 @@ pub async fn event_listener(bpf_maps: BpfMapsData) -> Result<(), anyhow::Error> } } - // return success Ok(()) } diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index e5558eb..0211be6 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -1,4 +1,15 @@ -use anyhow::{Context, Ok}; +//! CortexBrain metrics service – eBPF-based telemetry with OpenTelemetry export. +//! +//! This binary is the node-level metrics agent for CortexBrain. It: +//! +//! 1. Initialises an OpenTelemetry metrics pipeline (OTLP / gRPC). +//! 2. Loads a compiled eBPF object and pins its maps to the BPF filesystem. +//! 3. Attaches a set of kernel kprobe programs. +//! 4. Starts asynchronous consumers that read per-CPU perf buffers and +//! emit OpenTelemetry instruments for every event. +//! 5. Blocks until `Ctrl-C` is received, then shuts down cleanly. 
+ +use anyhow::Context; use aya::Ebpf; use std::{ env, fs, @@ -6,9 +17,10 @@ use std::{ sync::{Arc, Mutex}, }; use tracing::{error, info}; - mod helpers; +mod otel_init; use crate::helpers::event_listener; +use crate::otel_init::{init_opentelemetry, shutdown_opentelemetry}; use cortexbrain_common::{ constants, @@ -19,12 +31,14 @@ use cortexbrain_common::{ #[tokio::main] async fn main() -> Result<(), anyhow::Error> { - //init tracing subscriber - let otlp_provider = otlp_logger_init("metrics-service".to_string()); + let _otlp_log_provider = otlp_logger_init("metrics-service".to_string()); info!("Starting metrics service..."); info!("fetching data"); + let meter = + init_opentelemetry().context("Failed to initialise OpenTelemetry metrics pipeline")?; + let bpf_path = env::var(constants::BPF_PATH).context("BPF_PATH environment variable required")?; let data = fs::read(Path::new(&bpf_path)).context("Failed to load file from path")?; @@ -35,30 +49,33 @@ async fn main() -> Result<(), anyhow::Error> { info!("Running Ebpf logger"); info!("loading programs"); - let bpf_map_save_path = std::env::var(constants::PIN_MAP_PATH) - .context("PIN_MAP_PATH environment variable required")?; + + let bpf_map_save_path = + env::var(constants::PIN_MAP_PATH).context("PIN_MAP_PATH environment variable required")?; let map_data = vec!["time_stamp_events".to_string(), "net_metrics".to_string()]; match init_bpf_maps(bpf.clone(), map_data) { - std::result::Result::Ok(bpf_maps) => { + Ok(bpf_maps) => { info!("BPF maps loaded successfully"); let pin_path = std::path::PathBuf::from(&bpf_map_save_path); info!("About to call map_pinner with path: {:?}", pin_path); + match map_pinner(bpf_maps, &pin_path) { - std::result::Result::Ok(maps) => { + Ok(maps) => { info!("BPF maps pinned successfully to {}", bpf_map_save_path); { load_program(bpf.clone(), "metrics_tracer", "tcp_identify_packet_loss") .context( - "An error occured during the execution of load_program function", + "An error occurred during 
the execution of load_program function", )?; - load_program(tcp_bpf,"tcp_v4_connect","tcp_v4_connect") - .context("An error occured during the execution of load_and_attach_tcp_programs function")?; - load_program(tcp_v6_bpf,"tcp_v6_connect","tcp_v6_connect") - .context("An error occured during the execution of load_and_attach_tcp_programs function")?; + load_program(tcp_bpf, "tcp_v4_connect", "tcp_v4_connect") + .context("An error occurred during the execution of load_and_attach_tcp_programs function")?; + + load_program(tcp_v6_bpf, "tcp_v6_connect", "tcp_v6_connect") + .context("An error occurred during the execution of load_and_attach_tcp_programs function")?; load_program( tcp_rev_bpf, @@ -66,23 +83,24 @@ async fn main() -> Result<(), anyhow::Error> { "tcp_rcv_state_process", ) .context( - "An error occured during the execution of load_program function", + "An error occurred during the execution of load_program function", )?; } - event_listener(maps).await?; + + // Hand off to the async event consumer + event_listener(maps, meter).await } Err(e) => { error!("Error pinning BPF maps: {:?}", e); - return Err(e); + shutdown_opentelemetry(); + Err(e) } } } Err(e) => { error!("Error initializing BPF maps: {:?}", e); - let _ = otlp_provider.shutdown(); - return Err(e); + shutdown_opentelemetry(); + Err(e) } } - - Ok(()) } diff --git a/core/src/components/metrics/src/mod.rs b/core/src/components/metrics/src/mod.rs index 8414b63..c5e2806 100644 --- a/core/src/components/metrics/src/mod.rs +++ b/core/src/components/metrics/src/mod.rs @@ -1,3 +1,2 @@ -mod structs; -mod enums; -mod helpers; \ No newline at end of file +mod helpers; +mod otel_init; From b433a13e3e1d7badc2bdb6885fe862752b083232 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 1 Jun 2026 19:17:52 +0200 Subject: [PATCH 16/17] [#175]: added metrics exporter in the otel-collector-config ConfigMap. 
Added updated image in the metrics.yaml with the new implementations --- core/src/testing/metrics.yaml | 2 +- core/src/testing/otel_agent.yaml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/core/src/testing/metrics.yaml b/core/src/testing/metrics.yaml index 262b28f..a106c4e 100644 --- a/core/src/testing/metrics.yaml +++ b/core/src/testing/metrics.yaml @@ -19,7 +19,7 @@ spec: hostNetwork: true containers: - name: metrics - image: lorenzotettamanti/cortexflow-metrics:0.1.2-test12 + image: lorenzotettamanti/cortexflow-metrics:otel-test-1 command: ["/bin/bash", "-c"] args: - | diff --git a/core/src/testing/otel_agent.yaml b/core/src/testing/otel_agent.yaml index 71b7e08..c5165ac 100644 --- a/core/src/testing/otel_agent.yaml +++ b/core/src/testing/otel_agent.yaml @@ -33,6 +33,9 @@ data: logs: receivers: [otlp] exporters: [otlp, logging] + metrics: + receivers: [otlp] + exporters: [otlp, logging] --- apiVersion: apps/v1 @@ -132,6 +135,10 @@ data: receivers: [otlp] processors: [memory_limiter] exporters: [logging] + metrics: + receivers: [otlp] + processors: [memory_limiter] + exporters: [logging] --- apiVersion: v1 From 836f0b4ff2766628d887317d56b5329364e4c135 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Fri, 5 Jun 2026 21:53:08 +0200 Subject: [PATCH 17/17] (fix): updated metrics.yaml manifest --- core/src/testing/metrics.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/testing/metrics.yaml b/core/src/testing/metrics.yaml index a106c4e..8a6c7d8 100644 --- a/core/src/testing/metrics.yaml +++ b/core/src/testing/metrics.yaml @@ -19,7 +19,7 @@ spec: hostNetwork: true containers: - name: metrics - image: lorenzotettamanti/cortexflow-metrics:otel-test-1 + image: lorenzotettamanti/cortexflow-metrics:otel-test-2 command: ["/bin/bash", "-c"] args: - |