trinity/utils.c at master · kernelslacker/trinity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <stdbool.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "breadcrumb_ring.h"
#include "child.h"
#include "debug.h"
#include "deferred-free.h"
#include "locks.h"
#include "objects.h"
#include "params.h"
#include "pc_format.h"
#include "pids.h"
#include "random.h"
#include "rnd.h"
#include "shm.h"
#include "signals.h"	// asb_copy_recover / asb_copy_active snapshot-copy guard
#include "stats.h"
#include "stats_ring.h"
#include "syscall.h"
#include "tables.h"
#include "trinity.h"
#include "utils.h"

/*
 * Use this allocator if you have an object a child writes to that you want
 * all other processes to see.
 *
 * Every allocation is tracked so that VM syscalls (munmap, madvise, mremap,
 * mprotect) can avoid clobbering trinity's own shared state.
 */

#ifdef CONFIG_GUARD_SHARED
/*
 * Runtime scope for the guard-page armour wired into __alloc_shared().
 * Initialised to GUARD_SCOPE_OFF; flipped to POOLS or ALL by parse_args()
 * when the operator passes --guard-shared[=pools|all].  Hot path:
 * __alloc_shared() reads this once per call.
 *
 *   OFF   - no guards, byte-identical to the legacy single-mmap path
 *           (modulo the runtime branch).  This is the production default.
 *   POOLS - guard only the long-lived regions tagged is_pool=true by
 *           their alloc site (kcov_shm, the shared str/obj heap, per-
 *           child childdata).  Bounded VMA cost, focused on the
 *           recurring corruption-witness clusters from the 2026-06-08
 *           overnight triages.
 *   ALL   - guard every alloc_shared() region, pool or not.  VMA cost
 *           scales with MAX_SHARED_ALLOCS; intended for short-run
 *           investigations where the writer might not be in the pools
 *           subset.  Warns + suggests raising vm.max_map_count at the
 *           flag-parse site.
 *
 * Off → __alloc_shared() and free_shared() collapse to today's exact
 * mmap / unregister behaviour at runtime as well, so a build that
 * compiled CONFIG_GUARD_SHARED in stays production-safe until the
 * operator opts in.
 */
/* Starts at GUARD_SCOPE_OFF; guard_scope_covers() switches on this value. */
enum guard_scope guard_shared_scope = GUARD_SCOPE_OFF;
#endif

/*
 * Primary table of tracked shared regions, one slot per registration.
 * Slots [0, nr_shared_regions) are in use.
 */
static struct {
	/* Start address of the region; __alloc_shared() stores the pointer
	 * it returns to its caller here. */
	unsigned long addr;
	/* Length of the region in bytes, as requested by the caller. */
	unsigned long size;
#ifdef CONFIG_GUARD_SHARED
	/*
	 * 1 iff __alloc_shared() wrapped this region in PROT_NONE guard
	 * pages; 0 for a legacy single-mmap region (alloc_shared with
	 * guards off, or track_shared_region for an externally-mmap'd
	 * mapping).  free_shared() reads this to decide whether to
	 * derive a guarded span (PAGE + pages + PAGE) and munmap the
	 * whole span, or to munmap just (addr, size) like the legacy
	 * path.  child_fault_handler reads this in P1.2 to attribute a
	 * SEGV that lands in a guard page to its abutting region.
	 */
	uint8_t guarded;
#endif
} shared_regions[MAX_SHARED_ALLOCS];
/* Number of slots of shared_regions[] currently in use. */
unsigned int nr_shared_regions;

/*
 * Bounded overflow tail for registrations that arrive once
 * shared_regions[] is full.  Exists so range_overlaps_shared() (via the
 * bitmap, which is still updated) and range_in_tracked_shared() (via the
 * linear walk extension below) keep protecting fuzzed mm syscalls from
 * clobbering the untracked region instead of silently failing open.
 *
 * Intentionally small: 256 slots is "absorb a moderately over-budget
 * fleet host long enough to fail loudly and tell the operator to raise
 * MAX_SHARED_ALLOCS or move to dynamic resize", not "a second pool to
 * keep growing into".  Exhausting the tail BUG()s in both debug and
 * release; under-protection of a writable shared mapping is the failure
 * class the whole tracker exists to prevent and is never preferable to
 * a loud abort.
 */
/* Capacity of the overflow tail, in slots. */
#define SHARED_REGIONS_OVERFLOW_TAIL 256

/*
 * Overflow tail: same slot layout as shared_regions[], filled by
 * register_shared_overflow().  Slots [0, nr_shared_regions_overflow)
 * are in use.
 */
static struct {
	unsigned long addr;	/* start address of the region */
	unsigned long size;	/* length of the region in bytes */
#ifdef CONFIG_GUARD_SHARED
	uint8_t guarded;	/* parity with shared_regions[] above */
#endif
} shared_regions_overflow[SHARED_REGIONS_OVERFLOW_TAIL];
/* Number of slots of shared_regions_overflow[] currently in use. */
static unsigned int nr_shared_regions_overflow;

/*
 * Bitmap accelerator for range_overlaps_shared().  One bit per
 * SHARED_BITMAP_GRANULARITY-byte chunk of user VA; a set bit means at
 * least one byte in that chunk belongs to a registered shared region.
 *
 * The mm-syscall sanitisers (madvise/mremap/mprotect/munmap/mseal/mbind/
 * process_madvise/remap_file_pages/...) call range_overlaps_shared()
 * once per fuzzed call, often many times per child per second.  The
 * original linear scan over shared_regions[] is O(N) per query with N
 * easily reaching 100+ on a 32-child fleet (per-child childdata,
 * fd_event ring, kcov ring, plus the global reserve).  Replacing the
 * scan with this bitmap turns the hot path into one or two word loads
 * for the common single-page query.
 *
 * Granularity 2 MiB is the natural unit for the conservative
 * over-reject guarantee: any 2 MiB chunk that touches a shared region
 * gets its bit set, and a query whose footprint hits that chunk
 * rejects.  False positives are possible at chunk boundaries (a
 * non-shared page co-located in the same 2 MiB chunk as a shared
 * region rejects too), which is the SAFETY direction -- under-reject
 * would let a fuzzed mmap call clobber trinity's own shared state.
 *
 * Span 1<<47 covers the canonical x86_64 user VA on default
 * (4-level page table) kernels.  Regions registered outside the span
 * trip the BUG_ON in shared_bitmap_mark(); queries entirely outside
 * the span return false (no tracked region can live there because the
 * BUG_ON would have fired).  At 1 bit per 2 MiB, the bitmap is
 * 1<<26 bits = 8 MiB of BSS, but it is mostly zero pages: only the
 * 4 KiB pages that cover actually-set bits ever fault in, so true
 * resident growth is in the kilobytes for a typical fleet host where
 * shared regions cluster in the mmap arena near 0x7f000000....
 */
#define SHARED_BITMAP_GRANULARITY_LOG2	21UL	/* 2 MiB per bit */
#define SHARED_BITMAP_VA_LOG2		47UL	/* 128 TiB user VA span */
/* Exclusive upper bound of the address range the bitmap can describe. */
#define SHARED_BITMAP_VA_SPAN		(1UL << SHARED_BITMAP_VA_LOG2)
/* Number of chunk bits: 1 << (47 - 21) = 1 << 26. */
#define SHARED_BITMAP_NBITS		(SHARED_BITMAP_VA_SPAN >> SHARED_BITMAP_GRANULARITY_LOG2)
/* Bits held by one unsigned long word of the bitmap. */
#define SHARED_BITMAP_BITS_PER_WORD	(8UL * sizeof(unsigned long))
/* Number of unsigned long words needed to hold SHARED_BITMAP_NBITS bits. */
#define SHARED_BITMAP_NWORDS		(SHARED_BITMAP_NBITS / SHARED_BITMAP_BITS_PER_WORD)

/* Bit N covers the chunk whose index is (addr >> SHARED_BITMAP_GRANULARITY_LOG2) == N. */
static unsigned long shared_region_bitmap[SHARED_BITMAP_NWORDS];

/*
 * Per-chunk refcount paired with shared_region_bitmap above.  Multiple
 * tracked regions may live in the same 2 MiB chunk (every alloc_shared
 * call rounds up to a chunk for bitmap purposes; nothing forbids two
 * adjacent mmaps landing in the same chunk).  The bit must stay set
 * until the LAST tracked region in the chunk is removed -- clearing it
 * on the first untrack would flip the safety invariant from
 * "over-reject" to "under-reject" for the surviving region in the
 * chunk, exactly the failure mode this whole guard exists to prevent.
 *
 * uint16_t covers the worst-case occupancy by a comfortable margin
 * (MAX_SHARED_ALLOCS + overflow tail = 4352 << 65535) and bumps BSS
 * from 8 MiB (the bitmap alone) to 8 MiB + 128 MiB.  Same lazy-faulting
 * argument as the bitmap: only chunks touched by registrations ever
 * fault their backing page in, so true resident growth stays in the
 * tens of KiB for the typical clustered fleet-host layout.
 */
/* Indexed by chunk number, the same index space as the bitmap bits. */
static uint16_t shared_region_refcount[SHARED_BITMAP_NBITS];

/* Report whether chunk bit @bit is currently set in shared_region_bitmap[]. */
static inline bool shared_bitmap_test(unsigned long bit)
{
	const unsigned long word =
		shared_region_bitmap[bit / SHARED_BITMAP_BITS_PER_WORD];
	const unsigned long mask = 1UL << (bit % SHARED_BITMAP_BITS_PER_WORD);

	return (word & mask) != 0;
}

/* Set chunk bit @bit in shared_region_bitmap[]. */
static inline void shared_bitmap_set(unsigned long bit)
{
	const unsigned long word_idx = bit / SHARED_BITMAP_BITS_PER_WORD;
	const unsigned long mask = 1UL << (bit % SHARED_BITMAP_BITS_PER_WORD);

	shared_region_bitmap[word_idx] |= mask;
}

/* Clear chunk bit @bit in shared_region_bitmap[]. */
static inline void shared_bitmap_clear(unsigned long bit)
{
	const unsigned long word_idx = bit / SHARED_BITMAP_BITS_PER_WORD;
	const unsigned long mask = 1UL << (bit % SHARED_BITMAP_BITS_PER_WORD);

	shared_region_bitmap[word_idx] &= ~mask;
}

/*
 * Mark every 2 MiB chunk that intersects [addr, addr+size).  Called
 * from the tail of alloc_shared() and track_shared_region() so the
 * bitmap stays in sync with shared_regions[].  size==0 is a no-op
 * (matches the "empty region overlaps nothing" semantics callers rely
 * on).  An out-of-span registration BUG()s loudly: the linear-scan
 * predecessor would have caught such a region, so silently dropping it
 * here would flip the safety invariant from "over-reject" to
 * "under-reject" -- the exact failure mode this whole guard exists to
 * prevent.
 */
static void shared_bitmap_mark(unsigned long addr, unsigned long size)
{
	unsigned long chunk, last_chunk;

	/* An empty region touches no chunk. */
	if (size == 0)
		return;

	/* The second test is written as a subtraction so it cannot wrap. */
	if (addr >= SHARED_BITMAP_VA_SPAN ||
	    size > SHARED_BITMAP_VA_SPAN - addr) {
		outputerr("shared_bitmap_mark: region 0x%lx+0x%lx outside "
			  "1<<%lu user VA span; widen SHARED_BITMAP_VA_LOG2\n",
			  addr, size, SHARED_BITMAP_VA_LOG2);
		BUG("shared region outside bitmap span");
	}

	/* Chunk holding the last byte of the region (inclusive bound). */
	last_chunk = (addr + size - 1) >> SHARED_BITMAP_GRANULARITY_LOG2;

	for (chunk = addr >> SHARED_BITMAP_GRANULARITY_LOG2;
	     chunk <= last_chunk; chunk++) {
		/* Refuse to wrap the 16-bit per-chunk counter. */
		if (shared_region_refcount[chunk] == UINT16_MAX) {
			outputerr("shared_bitmap_mark: refcount overflow at "
				  "chunk %lu for region 0x%lx+0x%lx\n",
				  chunk, addr, size);
			BUG("shared region refcount overflow");
		}
		shared_region_refcount[chunk] += 1;
		shared_bitmap_set(chunk);
	}
}

/*
 * Inverse of shared_bitmap_mark().  Decrements the per-chunk refcount
 * for every 2 MiB chunk the range spans and clears the bitmap bit only
 * once the chunk's last tracked region is gone.  Called from
 * untrack_shared_region() after a matching shared_regions[] slot has
 * been located, so an inconsistency (refcount==0 on a chunk the caller
 * believes it tracked) is a tree-state bug worth BUG()ing on rather
 * than silently masking -- a stuck bit with refcount==0 would falsely
 * reject every fuzzed mm syscall touching the chunk forever.  An
 * out-of-span unmark cannot occur in practice because shared_bitmap_
 * mark() BUG()s on out-of-span marks, so the symmetric guard here is
 * defence-in-depth, not a reachable path.
 */
static void shared_bitmap_unmark(unsigned long addr, unsigned long size)
{
	unsigned long chunk, last_chunk;

	/* Mirrors shared_bitmap_mark(): an empty region marked nothing. */
	if (size == 0)
		return;

	if (addr >= SHARED_BITMAP_VA_SPAN ||
	    size > SHARED_BITMAP_VA_SPAN - addr) {
		outputerr("shared_bitmap_unmark: region 0x%lx+0x%lx outside "
			  "1<<%lu user VA span\n",
			  addr, size, SHARED_BITMAP_VA_LOG2);
		BUG("shared region outside bitmap span");
	}

	/* Chunk holding the last byte of the region (inclusive bound). */
	last_chunk = (addr + size - 1) >> SHARED_BITMAP_GRANULARITY_LOG2;

	for (chunk = addr >> SHARED_BITMAP_GRANULARITY_LOG2;
	     chunk <= last_chunk; chunk++) {
		/* A zero count here means mark/unmark calls are unbalanced. */
		if (shared_region_refcount[chunk] == 0) {
			outputerr("shared_bitmap_unmark: refcount underflow at "
				  "chunk %lu for region 0x%lx+0x%lx\n",
				  chunk, addr, size);
			BUG("shared region refcount underflow");
		}

		shared_region_refcount[chunk] -= 1;
		/* Only the last region leaving a chunk clears its bit. */
		if (shared_region_refcount[chunk] == 0)
			shared_bitmap_clear(chunk);
	}
}

/*
 * Size-bucket bitmap accelerator for range_overlaps_shared(): companion
 * to the address-keyed shared_region_bitmap above.  Bit i is set
 * whenever at least one tracked shared region currently falls into
 * size bucket i, where bucket i = floor(log2(len)) and covers regions
 * of len in [2^i, 2^(i+1)).  An empty bitmap (no tracked region of any
 * size) is the useful negative the address bitmap has to discover one
 * word at a time: one load here short-circuits the SHARED_BITMAP_NWORDS
 * word-scan over a multi-MiB query, plus the downstream byte-precise
 * walk that confirms a bitmap hit.
 *
 * Distinct concern from shared_region_bitmap above.  That bitmap
 * encodes WHERE tracked regions live (one bit per 2 MiB chunk of user
 * VA); this one encodes only WHETHER any tracked region exists in each
 * size class.  The two are wired in pairs: every register
 * (alloc_shared, track_shared_region, register_shared_overflow) calls
 * shared_bitmap_mark() AND tracked_size_mark(); every untrack (the
 * regular slot AND the overflow tail path in untrack_shared_region)
 * calls shared_bitmap_unmark() AND tracked_size_unmark().  Forgetting
 * the parallel call in a future refactor flips the size bitmap's
 * safety invariant from "empty ⇒ provably no regions" to "empty ⇒
 * silently under-reject"; shared_bitmap_self_check() asserts the
 * positive-path wiring at startup so that class of bug fails loudly.
 *
 * 64 buckets is the natural cap: a single unsigned long stores the
 * whole bitmap, and SHARED_BITMAP_VA_SPAN = 1<<47 bounds the largest
 * possible region at bucket 47 anyway -- buckets 48..63 stay zero on
 * any legitimate registration.  Per-bucket uint16_t refcount keeps the
 * bit set until the LAST region in that size class drops, mirroring
 * the shared_region_refcount discipline on the address bitmap; the
 * 4352-region worst case (MAX_SHARED_ALLOCS + SHARED_REGIONS_OVERFLOW_
 * TAIL) sits comfortably under UINT16_MAX, so a pathological run that
 * crowds every region into one bucket cannot overflow the counter.
 *
 * size==0 is a no-op for the same reason shared_bitmap_mark() no-ops
 * on size==0: the registering caller treats a zero-byte region as "no
 * region" and floor(log2(0)) is undefined, so suppressing the bump
 * here keeps the bitmaps in lockstep and avoids a spurious bucket-0
 * entry that no matching untrack would ever clear.
 */
/* Number of size buckets; also the number of bits used in tracked_size_bm. */
#define TRACKED_SIZE_NBUCKETS	64
/* Bit b is set while tracked_size_bucket_count[b] is non-zero. */
static unsigned long tracked_size_bm;
/* Count of tracked regions per bucket, maintained by tracked_size_mark()/unmark(). */
static uint16_t tracked_size_bucket_count[TRACKED_SIZE_NBUCKETS];

/*
 * Map a region length to its size bucket: floor(log2(len)), i.e. the
 * index of the highest set bit of @len.
 *
 * The top bit index is derived from the width of unsigned long rather
 * than hard-coded as 63, so the result stays floor(log2(len)) on an ABI
 * where unsigned long is not 64 bits wide.
 *
 * Precondition: len != 0.  __builtin_clzl(0) is undefined;
 * tracked_size_mark() and tracked_size_unmark() both return early on
 * len == 0 before calling here.
 */
static inline unsigned int tracked_size_bucket(unsigned long len)
{
	const unsigned int top_bit =
		(unsigned int)(sizeof(unsigned long) * CHAR_BIT) - 1u;

	return top_bit - (unsigned int)__builtin_clzl(len);
}

/*
 * Account one tracked region of @len bytes in its size bucket and set
 * the bucket's bit in tracked_size_bm when it becomes non-empty.
 * len == 0 is ignored.
 */
static void tracked_size_mark(unsigned long len)
{
	unsigned int bucket;

	if (len == 0)
		return;

	bucket = tracked_size_bucket(len);
	if (bucket >= TRACKED_SIZE_NBUCKETS) {
		outputerr("tracked_size_mark: bucket %u out of range for len 0x%lx\n",
			  bucket, len);
		BUG("tracked_size bucket out of range");
	}
	/* Refuse to wrap the 16-bit per-bucket counter. */
	if (tracked_size_bucket_count[bucket] == UINT16_MAX) {
		outputerr("tracked_size_mark: bucket %u refcount overflow for len 0x%lx\n",
			  bucket, len);
		BUG("tracked_size bucket refcount overflow");
	}

	tracked_size_bucket_count[bucket] += 1;
	/* First region in this bucket: publish the bit. */
	if (tracked_size_bucket_count[bucket] == 1)
		tracked_size_bm |= 1UL << bucket;
}

/*
 * Drop one tracked region of @len bytes from its size bucket and clear
 * the bucket's bit in tracked_size_bm when the bucket empties.
 * len == 0 is ignored, mirroring tracked_size_mark().
 */
static void tracked_size_unmark(unsigned long len)
{
	unsigned int bucket;

	if (len == 0)
		return;

	bucket = tracked_size_bucket(len);
	if (bucket >= TRACKED_SIZE_NBUCKETS) {
		outputerr("tracked_size_unmark: bucket %u out of range for len 0x%lx\n",
			  bucket, len);
		BUG("tracked_size bucket out of range");
	}
	/* A zero count here means mark/unmark calls are unbalanced. */
	if (tracked_size_bucket_count[bucket] == 0) {
		outputerr("tracked_size_unmark: bucket %u refcount underflow for len 0x%lx\n",
			  bucket, len);
		BUG("tracked_size bucket refcount underflow");
	}

	tracked_size_bucket_count[bucket] -= 1;
	/* Last region of this size class is gone: retract the bit. */
	if (tracked_size_bucket_count[bucket] == 0)
		tracked_size_bm &= ~(1UL << bucket);
}

/*
 * Handle a registration that arrived once shared_regions[] is full.
 *
 * The previous "warn once, then silently drop the region" policy turned
 * an over-budget host into the exact failure mode this whole tracker
 * exists to prevent: range_overlaps_shared() can no longer guard an
 * untracked writable MAP_SHARED region from a fuzzed
 * munmap/mremap/madvise/mprotect, so the next call that picks an
 * unlucky address scribbles trinity's own shared state and the
 * resulting crash looks like a kernel bug.  Silent under-protection of
 * a writable shared mapping is never preferable to a loud abort.
 *
 * New policy, per call:
 *
 *   - Always emit a LOUD outputerr() naming the caller PC (resolved via
 *     pc_to_string, same idiom as log_mprotect_failure()), the offending
 *     region, and the tail occupancy.  Per-call (not cap-once): the
 *     cap-once predecessor hid how badly the cap was over budget, which
 *     is the one piece of data needed to size a real fix.
 *
 *   - Under ASAN (the developer / debug build), BUG() immediately --
 *     overflow is a tree-state bug and we want a stack trace, not a
 *     production-shaped degradation.
 *
 *   - In release, register the region in the bounded overflow tail so
 *     the bitmap stays correct (shared_bitmap_mark already covers the
 *     range) and range_in_tracked_shared() can still match precisely.
 *     Bump shm->stats.shared_region_overflow so the over-budget state
 *     is visible in the periodic stats dump.
 *
 *   - If the overflow tail itself fills, BUG() in both debug and
 *     release.  Two layers of bounded storage is enough; a third would
 *     just be a slower path to the same silent-under-protection bug.
 */
static void register_shared_overflow(const char *who, unsigned long addr,
				     unsigned long size,
#ifdef CONFIG_GUARD_SHARED
				     bool guarded,
#endif
				     void *caller)
{
	char where[128];

	/* Logged on every overflow, before any abort, so the caller PC and
	 * the current tail occupancy always reach the log. */
	outputerr("shared_regions: %s overflow: region 0x%lx+0x%lx from %s; "
		  "MAX_SHARED_ALLOCS=%d exhausted, overflow tail at %u/%d -- "
		  "raise the cap or move shared_regions[] to dynamic resize\n",
		  who, addr, size,
		  pc_to_string(caller, where, sizeof(where)),
		  MAX_SHARED_ALLOCS,
		  nr_shared_regions_overflow, SHARED_REGIONS_OVERFLOW_TAIL);

#ifdef __SANITIZE_ADDRESS__
	/* ASAN build: overflow is treated as fatal straight away. */
	BUG("shared_regions[] overflow (debug build)");
#else
	{
		/* Next free tail slot; only written after the capacity check. */
		const unsigned int slot = nr_shared_regions_overflow;

		if (slot >= SHARED_REGIONS_OVERFLOW_TAIL) {
			outputerr("shared_regions: overflow tail also exhausted "
				  "(%d slots); refusing to leave region 0x%lx+0x%lx "
				  "untracked\n",
				  SHARED_REGIONS_OVERFLOW_TAIL, addr, size);
			BUG("shared_regions overflow tail exhausted");
		}

		shared_regions_overflow[slot].addr = addr;
		shared_regions_overflow[slot].size = size;
#ifdef CONFIG_GUARD_SHARED
		shared_regions_overflow[slot].guarded = guarded ? 1 : 0;
#endif
		/* Keep both accelerators in step with the new tail entry. */
		shared_bitmap_mark(addr, size);
		tracked_size_mark(size);
		nr_shared_regions_overflow++;

		/* shm may not be set up yet; skip the stat in that case. */
		if (shm)
			__atomic_add_fetch(&shm->stats.shared_region_overflow, 1,
					   __ATOMIC_RELAXED);
	}
#endif
}

#ifdef CONFIG_GUARD_SHARED
/*
 * Round len up to the nearest page boundary.  page_size is populated by
 * init_main_process() before parse_args() and any alloc_shared() caller,
 * so it is always non-zero by the time this is reachable.
 */
static size_t guard_pages_round_up(size_t len)
{
	/* page_size - 1; the mask trick relies on page_size being a power
	 * of two. */
	const size_t mask = (size_t)page_size - 1;

	return (len + mask) & ~mask;
}

/*
 * Recover (base, span) from the inner pointer + size of a guarded
 * region.  __alloc_shared() lays out a guarded mapping as
 *
 *   | leading guard (1 page) | unused fold | inner buffer | trailing guard (1 page) |
 *   ^base                     ^base+PAGE    ^ret           ^base+PAGE+pages
 *
 * with pages = round_up(size, page_size) and the inner buffer end-
 * aligned against the trailing guard so a forward overflow (buf[size])
 * traps at byte granularity.  Inverting:
 *
 *   pages = round_up(size, page_size)
 *   base  = ret - PAGE - (pages - size)
 *   span  = PAGE + pages + PAGE
 *
 * The size is stored in shared_regions[].size and the guarded bit is
 * stored alongside, so free_shared() needs no parallel side table to
 * unwind the layout.
 */
static void guard_pages_derive_span(void *ret, size_t size,
				    void **base_out, size_t *span_out)
{
	const size_t ps = (size_t)page_size;
	const size_t pages = guard_pages_round_up(size);
	/* Unused bytes between the leading guard and the inner buffer. */
	const size_t fold = pages - size;

	/* Step back over the fold, then over the leading guard page. */
	*base_out = (char *)ret - ps - fold;
	/* leading guard + usable pages + trailing guard */
	*span_out = ps + pages + ps;
}

/*
 * Mmap a guarded region: one VA span = leading-guard + usable-pages +
 * trailing-guard, with the inner buffer end-aligned against the
 * trailing guard.  Returns the inner pointer (the address callers see
 * and store in shared_regions[]), or MAP_FAILED on failure.  On
 * failure logs a single outputerr() line and leaves no VMA behind:
 * the leading-guard mprotect is reverted by munmap before return so
 * the caller can fall back to a non-guarded mmap without leaking VA.
 */
static void *guard_pages_alloc(size_t size)
{
	size_t ps = (size_t)page_size;
	size_t pages, span;
	char *base;

	/*
	 * Rounding size up to a page multiple and adding the two guard
	 * pages must not wrap size_t.  If it did, the mmap of the small
	 * wrapped span would succeed and the pointer returned below
	 * (base + ps + (pages - size)) would lie outside the mapping.
	 * With size <= SIZE_MAX - 3*ps, size + ps - 1 and pages + 2*ps
	 * both stay in range.  No mapping has been created yet, so there
	 * is nothing to undo on this path.
	 */
	if (size > SIZE_MAX - 3 * ps) {
		outputerr("guard_pages_alloc: size %zu too large to guard\n",
			  size);
		return MAP_FAILED;
	}

	pages = guard_pages_round_up(size);
	span = ps + pages + ps;

	base = mmap(NULL, span, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_SHARED, -1, 0);
	if (base == MAP_FAILED) {
		outputerr("guard_pages_alloc: mmap %zu failure (span=%zu)\n",
			  size, span);
		return MAP_FAILED;
	}

	/* Drop the leading and trailing pages to PROT_NONE so any
	 * adjacent overflow traps in copy_*_user (kernel-side) or
	 * directly at the writer PC (userspace).  Splits the span into
	 * three VMAs (guard / usable / guard); the cost is +2 VMAs per
	 * guarded region.
	 *
	 * Both mprotects run once per guarded region at setup time
	 * (alloc_shared is called from init paths, not from the arg-gen
	 * hot loop), so the slow-path checker's blanket ban does not
	 * apply -- mark explicitly to keep the surface honest.
	 */
	/* check-static: slow-ok */
	if (mprotect(base, ps, PROT_NONE) != 0) {
		outputerr("guard_pages_alloc: mprotect(leading) failed: errno=%d\n",
			  errno);
		/* Release the whole span so no VMA is left behind. */
		(void)munmap(base, span);
		return MAP_FAILED;
	}
	/* check-static: slow-ok */
	if (mprotect(base + ps + pages, ps, PROT_NONE) != 0) {
		outputerr("guard_pages_alloc: mprotect(trailing) failed: errno=%d\n",
			  errno);
		(void)munmap(base, span);
		return MAP_FAILED;
	}

	/* End-align the inner buffer against the trailing guard so a
	 * forward overflow at byte granularity (buf[size] = x) faults
	 * at the writer PC instead of corrupting the fold region. */
	return base + ps + (pages - size);
}

/*
 * Decide whether this allocation falls into the current guard scope.
 * GUARD_SCOPE_OFF gates everything off (legacy fast path).
 * GUARD_SCOPE_POOLS guards only is_pool=true alloc sites (kcov_shm,
 * shared str heap, childdata -- the long-lived regions the corruption
 * clusters keep pointing at).  GUARD_SCOPE_ALL guards every site.
 */
static bool guard_scope_covers(bool is_pool)
{
	/* ALL guards every allocation regardless of its pool tag. */
	if (guard_shared_scope == GUARD_SCOPE_ALL)
		return true;

	/* POOLS guards exactly the allocations tagged as pools. */
	if (guard_shared_scope == GUARD_SCOPE_POOLS)
		return is_pool;

	/* OFF, or any value not recognised above: no guards. */
	return false;
}

/*
 * Classify a fault address against the guarded regions tracked in
 * shared_regions[].  Returns true and fills outs when @fault_addr
 * lands in either the leading or trailing PROT_NONE page abutting a
 * guarded region; false otherwise.
 *
 * Called from child_fault_handler() on every fatal-signal delivery
 * before the in-handler diagnostic path runs, so this MUST be async-
 * signal-safe: plain reads of file-scope arrays only -- no allocator,
 * no stdio, no lock, no libc call outside the POSIX 2024 sec 2.4.3 set.
 * shared_regions[] is published once at init time (single-threaded
 * parent context) and never mutated past first fork, so a child
 * handler observing it sees a stable snapshot.  The page_size global
 * is set in init_main_process(), also before any fork.
 *
 * @delta_out is the byte distance between the fault address and the
 * region: fault_addr - region_end for a trailing-guard fault (0 means
 * the first byte past the end), or region_start - fault_addr for a
 * leading-guard fault.  A trailing delta is always below page_size; a
 * leading delta can reach page_size plus the unused fold
 * (pages - size), because the fold lies between the leading guard and
 * the start of the buffer.
 */
/*
 * Test a single guarded region (@addr, @size) against @fault_addr.
 * Pure arithmetic on its arguments: no calls, no global state.  On a
 * hit, records which guard page was touched and the byte distance to
 * the region, and returns true; on a miss the outputs are untouched.
 */
static bool guard_pages_classify_one(uintptr_t fault_addr, uintptr_t ps,
				     unsigned long addr, unsigned long size,
				     bool *trailing_out,
				     unsigned long *delta_out)
{
	/* Usable span rounded up to whole pages. */
	unsigned long pages = (size + ps - 1) & ~(ps - 1);
	/* Leading guard: one page below the fold that precedes the buffer. */
	uintptr_t lead = addr - ps - (pages - size);
	/* Trailing guard: directly after the usable pages.  With the
	 * buffer end-aligned this is the same address as addr + size. */
	uintptr_t trail = lead + ps + pages;

	if (fault_addr >= lead && fault_addr < lead + ps) {
		*trailing_out = false;
		*delta_out = (unsigned long)(addr - fault_addr);
		return true;
	}
	if (fault_addr >= trail && fault_addr < trail + ps) {
		*trailing_out = true;
		*delta_out = (unsigned long)(fault_addr - (addr + size));
		return true;
	}
	return false;
}

bool guard_pages_classify(uintptr_t fault_addr,
			  uintptr_t *region_addr_out,
			  size_t *region_size_out,
			  bool *trailing_out,
			  unsigned long *delta_out)
{
	const uintptr_t ps = (uintptr_t)page_size;
	unsigned int idx;

	/* Without a page size no guard geometry can be derived. */
	if (ps == 0)
		return false;

	/* Primary table first; unguarded slots have no guard pages. */
	for (idx = 0; idx < nr_shared_regions; idx++) {
		if (!shared_regions[idx].guarded)
			continue;
		if (!guard_pages_classify_one(fault_addr, ps,
					      shared_regions[idx].addr,
					      shared_regions[idx].size,
					      trailing_out, delta_out))
			continue;

		*region_addr_out = shared_regions[idx].addr;
		*region_size_out = shared_regions[idx].size;
		return true;
	}

	/* Then the overflow tail, with the same per-region test. */
	for (idx = 0; idx < nr_shared_regions_overflow; idx++) {
		if (!shared_regions_overflow[idx].guarded)
			continue;
		if (!guard_pages_classify_one(fault_addr, ps,
					      shared_regions_overflow[idx].addr,
					      shared_regions_overflow[idx].size,
					      trailing_out, delta_out))
			continue;

		*region_addr_out = shared_regions_overflow[idx].addr;
		*region_size_out = shared_regions_overflow[idx].size;
		return true;
	}

	return false;
}
#endif	/* CONFIG_GUARD_SHARED */

#ifdef CONFIG_GUARD_SHARED
/*
 * Primary shared-region allocator under CONFIG_GUARD_SHARED.  is_pool
 * tags long-lived regions (kcov_shm, shared str heap, childdata) so
 * --guard-shared=pools picks them up without dragging every per-child
 * tiny alloc into the VMA budget.  alloc_shared() below is the no-pool
 * entry point most call sites use; the three pool sites call
 * alloc_shared_pool() (a thin wrapper) which routes here with
 * is_pool=true.
 *
 * Behaviour matrix:
 *
 *   scope == OFF             -> single-mmap path (one runtime branch,
 *                               no extra syscalls).
 *   scope covers is_pool     -> guarded layout from guard_pages_alloc();
 *                               a guard-alloc failure logs and falls
 *                               back to the non-guarded path so the
 *                               run continues.
 *
 * Either path registers the INNER (ret, size) with shared_regions[] and
 * the bitmap; the guard pages are deliberately NOT tracked so the mm-
 * syscall sanitisers don't reject fuzzed calls against unrelated VA
 * that happens to share a 2 MiB bitmap chunk with a guard.  free_shared
 * inverts the layout via the guarded flag stored alongside.
 */
void * __alloc_shared(size_t size, bool is_pool)
{
	void *ret = MAP_FAILED;
	bool guarded = false;
	unsigned char *bytes;
	size_t off;

	/* Try the guarded layout first when the scope asks for it. */
	if (guard_scope_covers(is_pool)) {
		ret = guard_pages_alloc(size);
		guarded = (ret != MAP_FAILED);
	}

	/* Scope off, or the guarded attempt failed: plain single mmap. */
	if (!guarded) {
		ret = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_ANON | MAP_SHARED, -1, 0);
		if (ret == MAP_FAILED) {
			outputerr("mmap %zu failure\n", size);
			exit(EXIT_FAILURE);
		}
	}

	/* poison with independently-random bytes to expose uninitialized reads. */
	bytes = ret;
	/* Whole unsigned-int words first, one rnd_u32() per word ... */
	for (off = 0; off + sizeof(unsigned int) <= size;
	     off += sizeof(unsigned int)) {
		unsigned int word = rnd_u32();

		memcpy(bytes + off, &word, sizeof(word));
	}
	/* ... then one rnd_u32() per leftover byte. */
	while (off < size) {
		bytes[off] = (unsigned char)rnd_u32();
		off++;
	}

	/* Table full: hand the region to the overflow path instead. */
	if (nr_shared_regions >= MAX_SHARED_ALLOCS) {
		register_shared_overflow("alloc_shared", (unsigned long) ret,
					 size, guarded,
					 __builtin_return_address(0));
		return ret;
	}

	/* Record the inner (ret, size) pair and update both accelerators. */
	shared_regions[nr_shared_regions].addr = (unsigned long) ret;
	shared_regions[nr_shared_regions].size = size;
	shared_regions[nr_shared_regions].guarded = guarded ? 1 : 0;
	shared_bitmap_mark((unsigned long) ret, size);
	tracked_size_mark(size);
	nr_shared_regions++;

	return ret;
}

/* Non-pool entry point: forwards to __alloc_shared() with is_pool = false. */
void * alloc_shared(size_t size)
{
	const bool is_pool = false;

	return __alloc_shared(size, is_pool);
}

/* Pool entry point: forwards to __alloc_shared() with is_pool = true. */
void * alloc_shared_pool(size_t size)
{
	const bool is_pool = true;

	return __alloc_shared(size, is_pool);
}

/*
 * Inverse of __alloc_shared().  Removes the matching shared_regions[]
 * slot, then munmaps either the full guarded span (PAGE + pages + PAGE
 * derived from the stored size+guarded flag) or the legacy (ret, size)
 * range.  No current alloc_shared caller has a destructor -- all pool
 * regions live for the parent's lifetime -- but the symmetry is the
 * spec contract for free-path correctness, and a future caller that
 * needs to release a pool region (test harness, lifecycle rework) must
 * route through here so the guard VMAs are not left behind.  Misses
 * silently to match untrack_shared_region()'s tolerance for callers
 * whose alloc was a no-op (size==0) or whose addr+size pair never
 * matched a registered slot exactly.
 */
void free_shared(void *p, size_t size)
{
	const unsigned long addr = (unsigned long)p;
	void *unmap_base = p;
	size_t unmap_len = size;
	bool guarded = false;
	bool found = false;
	unsigned int idx;

	if (!p)
		return;

	/* Look for an exact (addr, size) match in the primary table first. */
	for (idx = 0; !found && idx < nr_shared_regions; idx++) {
		if (shared_regions[idx].addr == addr &&
		    shared_regions[idx].size == size) {
			guarded = shared_regions[idx].guarded != 0;
			found = true;
		}
	}

	/* Not in the primary table: consult the overflow table the same way. */
	for (idx = 0; !found && idx < nr_shared_regions_overflow; idx++) {
		if (shared_regions_overflow[idx].addr == addr &&
		    shared_regions_overflow[idx].size == size) {
			guarded = shared_regions_overflow[idx].guarded != 0;
			found = true;
		}
	}

	/* Drop the registration before the mapping itself goes away. */
	untrack_shared_region(addr, size);

	/*
	 * For a guarded region the span to unmap is derived from (p, size);
	 * otherwise (including a lookup miss) it is (p, size) as passed in.
	 */
	if (guarded)
		guard_pages_derive_span(p, size, &unmap_base, &unmap_len);

	if (munmap(unmap_base, unmap_len) == 0)
		return;

	outputerr("free_shared: munmap(%p, %zu) failed: errno=%d\n",
		  unmap_base, unmap_len, errno);
}

#else	/* !CONFIG_GUARD_SHARED */

/*
 * Legacy single-mmap path.  Byte-identical to pre-guard-armor trinity.
 */
void * alloc_shared(size_t size)
{
	void *region;
	unsigned char *cursor;
	size_t nwords, tail, w;

	/* One anonymous shared read/write mapping; failure is fatal. */
	region = mmap(NULL, size, PROT_READ | PROT_WRITE,
		      MAP_ANON | MAP_SHARED, -1, 0);
	if (region == MAP_FAILED) {
		outputerr("mmap %zu failure\n", size);
		exit(EXIT_FAILURE);
	}

	/*
	 * Fill the region with random data so reads of never-written memory
	 * stand out: one rnd_u32() per whole unsigned int, then one
	 * rnd_u32() per leftover byte.
	 */
	cursor = region;
	nwords = size / sizeof(unsigned int);
	for (w = 0; w < nwords; w++) {
		unsigned int word = rnd_u32();

		memcpy(cursor, &word, sizeof(word));
		cursor += sizeof(word);
	}
	for (tail = size % sizeof(unsigned int); tail > 0; tail--)
		*cursor++ = (unsigned char)rnd_u32();

	/* Primary table full: hand the registration to the overflow path. */
	if (nr_shared_regions >= MAX_SHARED_ALLOCS) {
		register_shared_overflow("alloc_shared", (unsigned long) region,
					 size, __builtin_return_address(0));
		return region;
	}

	/* Record (addr, size), mark the bitmap and size tracker, publish. */
	shared_regions[nr_shared_regions].addr = (unsigned long) region;
	shared_regions[nr_shared_regions].size = size;
	shared_bitmap_mark((unsigned long) region, size);
	tracked_size_mark(size);
	nr_shared_regions++;

	return region;
}

#endif	/* CONFIG_GUARD_SHARED */

/*
 * Add an externally-mmap'd region to the shared_regions tracker so the
 * range_overlaps_shared() guards in the mm-syscall sanitisers refuse
 * fuzzed munmap/mremap/madvise/mprotect calls that target it.  Used by
 * code that mmaps via something other than alloc_shared() and still
 * needs the region protected from the fuzzer -- e.g., the per-child
 * kcov ring buffer mapped from /sys/kernel/debug/kcov.
 */
void track_shared_region(unsigned long addr, unsigned long size)
{
	/* No free slot in the primary table: register via the overflow path. */
	if (nr_shared_regions >= MAX_SHARED_ALLOCS) {
		register_shared_overflow("track_shared_region", addr, size,
#ifdef CONFIG_GUARD_SHARED
					 false,
#endif
					 __builtin_return_address(0));
		return;
	}

	shared_regions[nr_shared_regions].addr = addr;
	shared_regions[nr_shared_regions].size = size;
#ifdef CONFIG_GUARD_SHARED
	/* Regions registered here are always recorded as not guarded. */
	shared_regions[nr_shared_regions].guarded = 0;
#endif
	shared_bitmap_mark(addr, size);
	tracked_size_mark(size);
	nr_shared_regions++;
}

/*
 * Inverse of track_shared_region() / alloc_shared() registration.
 * Removes the matching shared_regions[] entry (exact addr+size match)
 * and undoes the bitmap refcount/bit it contributed, so providers that
 * munmap their region on destructor (io_uring rings, kvm vCPU run
 * pages) stop accumulating stale slots and stop holding the bitmap bit
 * set after their VA has been recycled to something unrelated.
 *
 * Slot reuse uses swap-with-last compaction: the freed slot inherits
 * the array tail, nr_shared_regions decrements.  Nothing depends on
 * shared_regions[] order beyond shared_bitmap_self_check() peeking at
 * slot 0, and that runs once at init -- well before any destructor can
 * fire -- so the order disturbance is invisible to live code paths
 * (range_overlaps_shared and range_in_tracked_shared both walk the
 * whole array).
 *
 * Walks the overflow tail too: a provider whose registration was
 * parked there is no less tracked from the caller's perspective and
 * must be unregistered the same way; otherwise the tail would only
 * ever grow.
 *
 * A miss returns silently rather than BUG()ing: a caller may
 * legitimately untrack a region whose original track call was a no-op
 * (e.g. size==0), or whose addr+size pair doesn't exactly match a
 * registration (the slot allocator is exact-match only).  Silent miss
 * is the same shape as Linux's __ClearPageReserved on a non-Reserved
 * page -- the inverse of a "best effort" registration is best effort.
 */
void untrack_shared_region(unsigned long addr, unsigned long size)
{
	unsigned int slot;

	/* Search the primary table for an exact (addr, size) match. */
	for (slot = 0; slot < nr_shared_regions; slot++) {
		if (shared_regions[slot].addr == addr &&
		    shared_regions[slot].size == size)
			break;
	}
	if (slot < nr_shared_regions) {
		shared_bitmap_unmark(addr, size);
		tracked_size_unmark(size);
		/* Fill the hole with the last entry, then shrink the table. */
		shared_regions[slot] = shared_regions[nr_shared_regions - 1];
		nr_shared_regions--;
		return;
	}

	/* Not in the primary table: repeat the search over the overflow table. */
	for (slot = 0; slot < nr_shared_regions_overflow; slot++) {
		if (shared_regions_overflow[slot].addr == addr &&
		    shared_regions_overflow[slot].size == size)
			break;
	}
	if (slot < nr_shared_regions_overflow) {
		shared_bitmap_unmark(addr, size);
		tracked_size_unmark(size);
		shared_regions_overflow[slot] =
			shared_regions_overflow[nr_shared_regions_overflow - 1];
		nr_shared_regions_overflow--;
	}
	/* No match in either table: nothing to undo. */
}

/*
 * Overflow-checked size multiplication.  Stores a * b through @out and
 * returns true when the product fits in size_t, false when it overflowed.
 */
bool shared_size_mul(size_t a, size_t b, size_t *out)
{
	bool overflowed = __builtin_mul_overflow(a, b, out);

	return !overflowed;
}

/*
 * Size-bucketed freelist for shared heap recycling.
 *
 * Eight fixed-size buckets cover the common allocation sizes.  A freed slot
 * whose aligned size falls within a bucket is pushed onto that bucket's
 * lock-free stack; the next alloc of the same size pops it instead of
 * burning new bump space.  Allocations larger than 1024 bytes bypass the
 * freelist and use the bump allocator directly (documented below).
 *
 * The freelist link lives in the slot's own first sizeof(uintptr_t) bytes.
 * This is safe because the slot is not live when the link is written: the
 * caller has just handed it back to us, and we zero the rest of the slot
 * before writing the link so that a use-after-free still surfaces as zero-
 * byte reads rather than as a stale link pointer.
 *
 * CAS ordering: RELAXED is sufficient for the same reason as the bump
 * cursor — the caller publishes the resulting object via add_object()'s
 * RELEASE store, which is the actual synchronisation point for consumers.
 *
 * ABA safety via tagged pointer.  A naive lock-free stack with a single
 * pointer head is vulnerable to the classic ABA race: a popper reads
 * old_head=X and next=*X, but before its CAS another thread pops X, pops
 * X.next, and then pushes X back.  The CAS still sees head==X and succeeds,
 * but it installs the stale "next" value, leaving head pointing at a slot
 * that has already been handed back to a caller.  Two callers then think
 * they own the same slot; the resulting double-use corrupts whichever
 * obj struct was layered over the slot and faults later in unrelated
 * code paths far from the buggy free.
 *
 * The mitigation is a 16-bit version counter packed into the high bits of
 * the head word.  Each push and pop increments the version; the CAS
 * compares the full 64-bit (version, ptr) tuple.  The A→B→A sequence above
 * now leaves the head as (X, ver+3) rather than (X, ver), so the racer's
 * CAS fails on the version mismatch and it retries with a fresh load.
 *
 * The packing exploits the canonical-form invariant of x86_64 user-space
 * virtual addresses: only bits 0-47 are significant, and bit 47 is 0 for
 * any user-space pointer (kernel pointers have bit 47 == 1 and are
 * sign-extended into the upper 16 bits).  We therefore stash the version
 * counter in bits 48-63, recover the pointer with a 48-bit mask, and need
 * no sign extension on read.  The slot's stored "next" link is just the
 * raw pointer (no version bits) — the version lives only in the head.
 *
 * The 16-bit version is finite: a perfectly-timed sequence of exactly
 * 65536 push/pop pairs in the gap between a victim's load and CAS would
 * wrap the version back to its original value and re-expose the race.
 * For a process-bounded fuzzer with sub-microsecond critical sections
 * this is astronomically improbable; if it ever proves observable the
 * head can be widened to a 128-bit (ptr, version) tuple and switched to a
 * cmpxchg16b-based DWCAS without any caller change.
 */

/*
 * The packed (ptr, version) freelist head assumes the top 16 bits of every
 * freelist pointer are zero — i.e. a 48-bit canonical userspace VA range,
 * which is the x86-64 default and is not guaranteed on arm64 (52-bit
 * possible), s390x, riscv, or x86-64 with 5-level paging enabled.  Reject
 * the build explicitly so a future port hits the wall here instead of