44#include <bpf/bpf_tracing.h>
55#include "sigsegv-monitor.h"
66
7- // By default is commented: a lot of #PF events are hit
8- // so enable only if it is acceptable.
9- // #define TRACE_PF_CR2
10-
// If /sys/kernel/tracing/tracing_on is set to 1,
// cat /sys/kernel/tracing/trace
// will show the bpf_printk() output.
@@ -21,12 +17,60 @@ struct trace_event_raw_page_fault_user {
2117 char __data [0 ];
2218};
2319
20+ struct cr2_stat {
21+ u64 cr2 ;
22+ u64 err ;
23+ u64 tai ;
24+ };
25+
26+ struct cr2_stats {
27+ struct cr2_stat stat [MAX_USER_PF_ENTRIES ];
28+ u64 head ;
29+ u64 count ;
30+ };
31+
2432struct {
2533 __uint (type , BPF_MAP_TYPE_HASH );
2634 __uint (max_entries , 1024 );
2735 __type (key , u32 );
28- __type (value , u64 );
36+ __type (value , struct cr2_stats );
2937} tgid_cr2 SEC (".maps" );
38+
39+ inline void cr2stats_init (struct cr2_stats * stats ) {
40+ stats -> head = 0 ;
41+ stats -> count = 0 ;
42+ }
43+
44+ inline void cr2stats_push (struct cr2_stats * stats , struct cr2_stat * value ) {
45+ if (stats -> head < MAX_USER_PF_ENTRIES ) {
46+ stats -> stat [stats -> head ] = * value ;
47+
48+ if (++ stats -> head == MAX_USER_PF_ENTRIES ) {
49+ stats -> head = 0 ;
50+ }
51+
52+ if (++ stats -> count == MAX_USER_PF_ENTRIES ) {
53+ stats -> count = 0 ;
54+ }
55+ }
56+ }
57+
58+ // The `index` parameter here is not an index in the array, but an index in the ring buffer,
59+ // i.e. passing an index 0 would return the oldest element in the ring buffer.
60+ inline struct cr2_stat * cr2stats_get (struct cr2_stats * stats , u32 index ) {
61+ if (stats -> count == MAX_USER_PF_ENTRIES ) {
62+ index += stats -> head ;
63+ if (index >= MAX_USER_PF_ENTRIES ) {
64+ index -= MAX_USER_PF_ENTRIES ;
65+ }
66+ }
67+
68+ if (index < MAX_USER_PF_ENTRIES ) {
69+ return stats -> stat + index ;
70+ }
71+
72+ return NULL ;
73+ }
3074#endif
3175
3276// Output map (for user space)
@@ -75,24 +119,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
75119 bpf_probe_read_kernel_str (& event -> tgleader_comm , sizeof (event -> tgleader_comm ), & task -> group_leader -> comm );
76120 // TODO: can the acquisition of pidns_tgid, pidns_pid be made more robust / simplified?
77121 {
78- struct pid const * thread_pid = task -> thread_pid ;
79- unsigned int const level = thread_pid -> level ;
80- // thread_pid->numbers is a size-one flexible array member (type numbers[1])
81- // => cannot perform bounds-check against BTF information
82- // => need bpf_probe_read_kernel to read from indices potentially > 1
83- struct upid const * upid_inv = & thread_pid -> numbers [level ];
84- event -> pidns_pid = BPF_CORE_READ (upid_inv , nr ); // we already have implicit CO-RE, but we need the probe function call
85- }
86- {
87- struct pid const * tgid_pid = task -> signal -> pids [PIDTYPE_TGID ];
88- unsigned int const level = tgid_pid -> level ;
89- struct upid const * tgid_upid_inv = & tgid_pid -> numbers [level ];
90- // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
91- // TODO: don't we need RCU here?
92- event -> pidns_tgid = BPF_CORE_READ (tgid_upid_inv , nr );
93- }
94-
95- event -> regs .trapno = task -> thread .trap_nr ; // TODO: also copy the other fields like cr2 and error_code
122+ struct pid const * thread_pid = task -> thread_pid ;
123+ unsigned int const level = thread_pid -> level ;
124+ // thread_pid->numbers is a size-one flexible array member (type numbers[1])
125+ // => cannot perform bounds-check against BTF information
126+ // => need bpf_probe_read_kernel to read from indices potentially > 1
127+ struct upid const * upid_inv = & thread_pid -> numbers [level ];
128+ event -> pidns_pid = BPF_CORE_READ (upid_inv , nr ); // we already have implicit CO-RE, but we need the probe function call
129+ }
130+ {
131+ struct pid const * tgid_pid = task -> signal -> pids [PIDTYPE_TGID ];
132+ unsigned int const level = tgid_pid -> level ;
133+ struct upid const * tgid_upid_inv = & tgid_pid -> numbers [level ];
134+ // TODO: doesn't this return the pid in the NS of the tg leader, instead of the pid in the NS of the current thread?
135+ // TODO: don't we need RCU here?
136+ event -> pidns_tgid = BPF_CORE_READ (tgid_upid_inv , nr );
137+ }
138+
139+ event -> regs .trapno = task -> thread .trap_nr ;
96140 event -> regs .err = task -> thread .error_code ;
97141
98142 // TODO: how are these regs acquired?
@@ -119,18 +163,28 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
119163 event -> regs .flags = regs -> flags ;
120164
121165 event -> regs .cr2 = task -> thread .cr2 ;
122- event -> regs .cr2_fault = -1 ;
123-
124- #ifdef TRACE_PF_CR2
125- u32 tgid = task -> tgid ;
126- u64 * cr2 = bpf_map_lookup_elem (& tgid_cr2 , & tgid );
166+ }
127167
128- if (cr2 ) {
129- event -> regs .cr2_fault = * cr2 ;
130- bpf_map_delete_elem (& tgid_cr2 , & tgid );
168+ event -> pf_count = 0 ;
169+ #ifdef TRACE_PF_CR2
170+ u32 tgid = task -> tgid ;
171+ struct cr2_stats * cr2stats = bpf_map_lookup_elem (& tgid_cr2 , & tgid );
172+
173+ if (cr2stats ) {
174+ for (u32 i = 0 ; i < cr2stats -> count && i < MAX_USER_PF_ENTRIES ; i ++ ) {
175+ struct cr2_stat * stat = cr2stats_get (cr2stats , i );
176+ if (stat ) {
177+ event -> pf [i ].cr2 = stat -> cr2 ;
178+ event -> pf [i ].err = stat -> err ;
179+ event -> pf [i ].tai = stat -> tai ;
180+
181+ ++ event -> pf_count ;
182+ }
131183 }
132- #endif
184+
185+ bpf_map_delete_elem (& tgid_cr2 , & tgid );
133186 }
187+ #endif
134188
135189 // TODO: when is this snapshot taken? or does the CPU not do LBR in the kernel?
136190 long ret = bpf_get_branch_snapshot (& event -> lbr , sizeof (event -> lbr ), 0 );
@@ -149,13 +203,24 @@ int trace_sigsegv(struct trace_event_raw_signal_generate *ctx) {
149203#ifdef TRACE_PF_CR2
150204SEC ("tracepoint/exceptions/page_fault_user" )
151205int trace_page_fault (struct trace_event_raw_page_fault_user * ctx ) {
152- u64 cr2 ;
206+ struct cr2_stat stat ;
153207 u32 tgid ;
154208
155- cr2 = ctx -> address ;
209+ stat .cr2 = ctx -> address ;
210+ stat .err = ctx -> error_code ;
211+ stat .tai = bpf_ktime_get_tai_ns ();
156212 tgid = bpf_get_current_pid_tgid () >> 32 ;
157213
158- bpf_map_update_elem (& tgid_cr2 , & tgid , & cr2 , BPF_ANY );
214+ struct cr2_stats * cr2stats = bpf_map_lookup_elem (& tgid_cr2 , & tgid );
215+ if (cr2stats ) {
216+ cr2stats_push (cr2stats , & stat );
217+ } else {
218+ struct cr2_stats new_stats ;
219+ cr2stats_init (& new_stats );
220+ cr2stats_push (& new_stats , & stat );
221+
222+ bpf_map_update_elem (& tgid_cr2 , & tgid , & new_stats , BPF_ANY );
223+ }
159224
160225 return 0 ;
161226}
0 commit comments