Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 104 additions & 39 deletions sigsegv-monitor.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
#include <bpf/bpf_tracing.h>
#include "sigsegv-monitor.h"

// By default is commented: a lot of #PF events are hit
// so enable only if it is acceptable.
// #define TRACE_PF_CR2

// if /sys/kernel/tracing/trace_on is set to 1,
// cat /sys/kernel/tracing/trace
// will show the bpf_printk() output
Expand All @@ -21,12 +17,60 @@ struct trace_event_raw_page_fault_user {
char __data[0];
};

struct cr2_stat {
u64 cr2;
u64 err;
u64 tai;
};

struct cr2_stats {
struct cr2_stat stat[MAX_USER_PF_ENTRIES];
u64 head;
u64 count;
};

struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(max_entries, 1024);
__type(key, u32);
__type(value, u64);
} tgid_cr2 SEC(".maps");
__type(value, struct cr2_stats);
} pid_cr2 SEC(".maps");

inline void cr2stats_init(struct cr2_stats* stats) {
stats->head = 0;
stats->count = 0;
}

inline void cr2stats_push(struct cr2_stats* stats, struct cr2_stat* value) {
if (stats->head < MAX_USER_PF_ENTRIES) {
stats->stat[stats->head] = *value;

if (++stats->head == MAX_USER_PF_ENTRIES) {
stats->head = 0;
}

if (stats->count < MAX_USER_PF_ENTRIES) {
++stats->count;
}
}
}

// The `index` parameter here is not an index in the array, but an index in the ring buffer,
// i.e. passing an index 0 would return the oldest element in the ring buffer.
inline struct cr2_stat* cr2stats_get(struct cr2_stats* stats, u32 index) {
if (stats->count == MAX_USER_PF_ENTRIES) {
index += stats->head;
if (index >= MAX_USER_PF_ENTRIES) {
index -= MAX_USER_PF_ENTRIES;
}
}

if (index < MAX_USER_PF_ENTRIES) {
return stats->stat + index;
}

return NULL;
}
#endif

// Output map (for user space)
Expand Down Expand Up @@ -75,24 +119,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
bpf_probe_read_kernel_str(&event->tgleader_comm, sizeof(event->tgleader_comm), &task->group_leader->comm);
// TODO: can the acquisition of pidns_tgid, pidns_pid be made more robust / simplified?
{
struct pid const* thread_pid = task->thread_pid;
unsigned int const level = thread_pid->level;
// thread_pid->numbers is a size-one flexible array member (type numbers[1])
// => cannot perform bounds-check against BTF information
// => need bpf_probe_read_kernel to read from indices potentially > 1
struct upid const* upid_inv = &thread_pid->numbers[level];
event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call
}
{
struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID];
unsigned int const level = tgid_pid->level;
struct upid const* tgid_upid_inv = &tgid_pid->numbers[level];
// TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
// TODO: don't we need RCU here?
event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr);
}

event->regs.trapno = task->thread.trap_nr; // TODO: also copy the other fields like cr2 and error_code
struct pid const* thread_pid = task->thread_pid;
unsigned int const level = thread_pid->level;
// thread_pid->numbers is a size-one flexible array member (type numbers[1])
// => cannot perform bounds-check against BTF information
// => need bpf_probe_read_kernel to read from indices potentially > 1
struct upid const* upid_inv = &thread_pid->numbers[level];
event->pidns_pid = BPF_CORE_READ(upid_inv, nr); // we already have implicit CO-RE, but we need the probe function call
}
{
struct pid const* tgid_pid = task->signal->pids[PIDTYPE_TGID];
unsigned int const level = tgid_pid->level;
struct upid const* tgid_upid_inv = &tgid_pid->numbers[level];
// TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
// TODO: don't we need RCU here?
event->pidns_tgid = BPF_CORE_READ(tgid_upid_inv, nr);
}

event->regs.trapno = task->thread.trap_nr;
event->regs.err = task->thread.error_code;

// TODO: how are these regs acquired?
Expand All @@ -119,18 +163,28 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
event->regs.flags = regs->flags;

event->regs.cr2 = task->thread.cr2;
event->regs.cr2_fault = -1;

#ifdef TRACE_PF_CR2
u32 tgid = task->tgid;
u64 *cr2 = bpf_map_lookup_elem(&tgid_cr2, &tgid);
}

if (cr2) {
event->regs.cr2_fault = *cr2;
bpf_map_delete_elem(&tgid_cr2, &tgid);
event->pf_count = 0;
#ifdef TRACE_PF_CR2
u32 pid = task->pid;
struct cr2_stats *cr2stats = bpf_map_lookup_elem(&pid_cr2, &pid);

if (cr2stats) {
for (u32 i = 0; i < cr2stats->count && i < MAX_USER_PF_ENTRIES; i++) {
struct cr2_stat* stat = cr2stats_get(cr2stats, i);
if (stat) {
event->pf[i].cr2 = stat->cr2;
event->pf[i].err = stat->err;
event->pf[i].tai = stat->tai;

++event->pf_count;
}
}
#endif

bpf_map_delete_elem(&pid_cr2, &pid);
}
#endif

// TODO: when is this snapshot taken? or does the CPU not do LBR in the kernel?
long ret = bpf_get_branch_snapshot(&event->lbr, sizeof(event->lbr), 0);
Expand All @@ -149,13 +203,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
#ifdef TRACE_PF_CR2
SEC("tracepoint/exceptions/page_fault_user")
int trace_page_fault(struct trace_event_raw_page_fault_user *ctx) {
u64 cr2;
u32 tgid;
struct cr2_stat stat;
u32 pid;

stat.cr2 = ctx->address;
stat.err = ctx->error_code;
stat.tai = bpf_ktime_get_tai_ns();
pid = (u32)bpf_get_current_pid_tgid();

cr2 = ctx->address;
tgid = bpf_get_current_pid_tgid() >> 32;
struct cr2_stats *cr2stats = bpf_map_lookup_elem(&pid_cr2, &pid);
if (cr2stats) {
cr2stats_push(cr2stats, &stat);
} else {
struct cr2_stats new_stats;
cr2stats_init(&new_stats);
cr2stats_push(&new_stats, &stat);

bpf_map_update_elem(&tgid_cr2, &tgid, &cr2, BPF_ANY);
bpf_map_update_elem(&pid_cr2, &pid, &new_stats, BPF_ANY);
}

return 0;
}
Expand Down
19 changes: 14 additions & 5 deletions sigsegv-monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,22 @@ void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) {
printf("\"flags\":\"0x%016llx\",", e->regs.flags);
printf("\"trapno\":\"0x%016llx\",", e->regs.trapno);
printf("\"err\":\"0x%016llx\",", e->regs.err);
printf("\"cr2\":\"0x%016llx\",", e->regs.cr2);
if (e->regs.cr2_fault != (u64)-1)
printf("\"cr2_fault\":\"0x%016llx\"", e->regs.cr2_fault);
else
printf("\"cr2_fault\":null");
printf("\"cr2\":\"0x%016llx\"", e->regs.cr2);
printf("},");

#ifdef TRACE_PF_CR2
printf("\"page_faults\": [");
for_each(i, e->pf_count)
{
printf("{\"cr2\":\"0x%016llx\",\"err\":\"0x%016llx\",\"tai\":%llu}", e->pf[i].cr2, e->pf[i].err, e->pf[i].tai);

if (i + 1 != e->pf_count) {
printf(",");
}
}
printf("],");
#endif

printf("\"lbr\":[");
int lbr_limit = (e->lbr_count < MAX_LBR_ENTRIES) ? e->lbr_count : MAX_LBR_ENTRIES;
for_each(i, lbr_limit) {
Expand Down
14 changes: 12 additions & 2 deletions sigsegv-monitor.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
#pragma once


#define MAX_LBR_ENTRIES 32
#define MAX_USER_PF_ENTRIES 16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

16 is fine if the ring buffer is for a single thread, but I fear that with dozens of threads, there might be too many PF, so any interesting one might be rotated out by the time we land in signal_generate. But due to the locking requirement (see my comment in the .bpf.c file) I think we should split this into a per-thread data structure (or per-CPU but while recording the pid, not sure...).


#define TRACE_PF_CR2

struct page_fault_info_t {
u64 cr2;
u64 err;
u64 tai;
};

struct user_regs_t {
u64 rip;
Expand All @@ -25,7 +33,6 @@ struct user_regs_t {
u64 trapno;
u64 err;
u64 cr2;
u64 cr2_fault;
};

// WARNING: this is for the SENDING process (e.g. pid) of the signal!
Expand All @@ -45,4 +52,7 @@ struct event_t {
struct perf_branch_entry lbr[MAX_LBR_ENTRIES];

u64 tai; // time atomic international

u32 pf_count;
struct page_fault_info_t pf[MAX_USER_PF_ENTRIES];
};