use perfect::*;
use perfect::events::*;
use perfect::stats::*;
use perfect::ir::branch::*;
use perfect::util::*;
use perfect::uarch::btb::*;
use itertools::*;
// Provides `thread_rng`, `Rng::gen_range`, and `SliceRandom::choose`,
// all of which are used below.
use rand::prelude::*;
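/// The four indirect branch encodings exercised by this test.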
const KIND: [IndBrn; 4] = [
IndBrn::CallMem, IndBrn::CallReg, IndBrn::JmpMem, IndBrn::JmpReg
];
/// Generate a random virtual address.
///
/// NOTE: The offset 0x80 is just a hack to make sure our emitters have
/// allocated enough space for us to put a short prologue.
/// This doesn't ("shouldn't") have any bearing on the test.
fn generate_random_addr() -> usize {
const USERSPACE_RANGE: std::ops::Range<usize> = {
0x0000_0000_0000..0x0000_7fff_ffff_ffff
};
let mut rng = thread_rng();
(rng.gen_range(USERSPACE_RANGE) & !0xff) + 0x80
}
fn main() {
PerfectEnv::pin_to_core(15);
// Test different indirect branches.
for brn_kind in KIND {
let arg_aliasing = BTBAliasArgs {
desc: "aliasing",
victim_addr: generate_random_addr(),
attacker_addr: None,
kind: brn_kind,
};
let arg_nonaliasing = BTBAliasArgs {
desc: "non-aliasing",
victim_addr: generate_random_addr(),
attacker_addr: Some(generate_random_addr()),
kind: brn_kind,
};
BTBAliasing::run(arg_nonaliasing);
BTBAliasing::run(arg_aliasing);
}
}
#[derive(Clone, Copy, Debug)]
pub enum IndBrn {
/// CALL (register operand)
CallReg,
/// CALL (memory operand)
CallMem,
/// JMP (register operand)
JmpReg,
/// JMP (memory operand)
JmpMem,
}
pub struct BTBAliasTest {
    /// Code containing the victim's indirect branch
    victim: X64AssemblerFixed,
    /// Code containing the attacker's indirect branch
    attacker: X64AssemblerFixed,
    /// Target for the victim's indirect branch
    victim_tgt: X64AssemblerFixed,
    /// Target for the attacker's indirect branch
    attacker_tgt: X64AssemblerFixed,
}
pub struct BTBAliasArgs {
desc: &'static str,
/// Victim indirect branch address
victim_addr: usize,
/// Attacker indirect branch address
attacker_addr: Option<usize>,
/// Branch type/encoding
kind: IndBrn,
}
/// Demonstrate the effects of aliasing BTB entries.
///
/// Context
/// =======
///
/// A "branch target buffer" (BTB) is a structure that maintains information
/// about previously-resolved branches and their target addresses.
///
/// When encountering a branch in the instruction stream, modern machines
/// attempt to predict the target address by trying to find a BTB entry for
/// the branch. In order to distinguish a particular branch, implementations
/// typically mix together the current program counter with information about
/// recent control-flow (i.e. the addresses and/or targets of recently-taken
/// branches). The resulting bits are then used as an index to access the BTB.
///
/// In general, it's possible that an implementation *cannot* distinguish
/// between certain branches, and that distinct branches may alias with
/// one another in the BTB.
///
/// "Branch Target Injection" (BTI)
/// ===============================
///
/// Aliasing of this kind is the basis for "branch target injection" (BTI)
/// attacks (which are sometimes also called "Spectre Variant 2").
///
/// If an attacker can intentionally create a branch that aliases with a
/// victim branch in the BTB, the attacker may be able to cause the victim
/// branch to suffer mispredictions. Since the attacker controls the target
/// address of the aliasing branch, the victim branch may suffer incorrect
/// speculation at a target chosen by the attacker.
///
/// Creating Aliasing BTB Entries
/// =============================
///
/// In our case, the RETBLEED paper[^1] describes the hash function for Zen 2
/// and Zen 3 parts. This uses the program counter of the branch to create a
/// 12-bit BTB index (see [`perfect::uarch::btb::ZEN2_BTB_INDEX_FN`]).
///
/// [^1]: [RETBLEED: Arbitrary Speculative Code Execution with Return Instructions](https://comsec.ethz.ch/wp-content/files/retbleed_sec22.pdf)
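///
/// As a toy illustration only (this is *not* the actual Zen 2 hash; see
/// [`perfect::uarch::btb::ZEN2_BTB_INDEX_FN`] for that), imagine folding
/// the program counter down to a 12-bit index; any two branch addresses
/// that fold to the same value would share a BTB entry:
///
/// ```ignore
/// // Hypothetical index function, for illustration only
/// fn toy_btb_index(pc: usize) -> usize {
///     ((pc >> 12) ^ pc) & 0xfff
/// }
/// // These two addresses fold to the same 12-bit index
/// assert_eq!(toy_btb_index(0x1000_0040), toy_btb_index(0x2000_0040));
/// ```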
///
/// Test
/// ====
///
/// This setup consists of the following parts:
///
/// - A "victim" indirect branch, and the victim's target
/// - An "attacker" indirect branch, and the attacker's target
/// - A "probe" cacheline used to determine whether or not the victim's
/// indirect branch was mispredicted
///
/// In order to distinguish cases where a misprediction occurs, we rely on the
/// fact that a load instruction in the attacker's target may affect the state
/// of the L1D cache.
///
/// In this case, [`Self::PROBE_ADDR`] is the address of a cacheline used to
/// make the distinction. This address is propagated into the victim's target,
/// which *does not* perform any load. After the test, we measure a load to
/// determine whether or not incorrect speculation occurred.
///
/// 1. Train the predictor by performing the attacker's indirect branch with
/// the attacker's target (while avoiding a load to the probe cacheline).
///
/// 2. Explicitly flush the probe cacheline.
///
/// 3. Perform the victim's indirect branch with the victim's target
/// (while passing the address of the probe cacheline).
///
/// 4. Use RDPRU and APERF to measure a load from the probe cacheline.
///
/// If we observe that the load to the probe cacheline completed quickly, we
/// expect that the following has occurred:
///
/// 1. The victim's branch was mispredicted using the BTB entry associated
///    with the attacker's branch, and incorrect speculation occurred at
///    `attacker_tgt`
///
/// 2. The address of the probe cacheline passed to the victim's branch was
/// propagated to the load instruction in the attacker's target.
///
/// 3. The probe cacheline was speculatively (and incorrectly) brought into
///    the L1D cache before the branch misprediction was resolved.
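///
/// In pseudocode, one iteration of the test (implemented by `run_test`
/// below) looks roughly like this, where `dummy_addr` stands in for any
/// address off the probe cacheline:
///
/// ```ignore
/// attacker_fn(attacker_tgt, dummy_addr); // 1. train the BTB entry
/// flush(PROBE_ADDR);                     // 2. evict the probe cacheline
/// victim_fn(victim_tgt, PROBE_ADDR);     // 3. may speculate into attacker_tgt
/// let cycles = time_load(PROBE_ADDR);    // 4. fast => misprediction occurred
/// ```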
///
pub struct BTBAliasing;
impl BTBAliasing {
/// Number of test iterations
const ITERS: usize = 32;
/// Address of the probed cacheline
const PROBE_ADDR: usize = 0x1234_1234_1000;
/// Scratch memory for saving a branch target
const TGT_STORAGE: usize = Self::PROBE_ADDR + 0x4c0;
/// Scratch memory for stack pointer
const RSP_STORAGE: usize = Self::PROBE_ADDR + 0x880;
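    // NOTE: Both scratch locations live in the page mapped at PROBE_ADDR,
    // but on cachelines distinct from the probe line itself, so scratch
    // accesses don't disturb the measurement.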
    /// Generate a virtual address whose BTB index collides with the
    /// BTB index for virtual address `addr`.
fn generate_collision_for(addr: usize) -> usize {
let mut rng = thread_rng();
        let coll = zen2_btb_collisions(addr, 1)
            .into_iter()
            .filter(|x| {
                *x < 0x7fff_ffff_ffff_usize &&
                *x != addr
            })
            .collect_vec();
        // `usize` is `Copy`, so dereference the chosen element instead of
        // cloning it
        *coll.choose(&mut rng).unwrap()
}
/// Flush the probe cacheline at [`Self::PROBE_ADDR`].
#[inline(always)]
fn flush_probe() {
unsafe {
let ptr = Self::PROBE_ADDR as *mut u8;
core::arch::x86_64::_mm_clflush(ptr);
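            // Fence to ensure the flush is ordered before any later accesses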
core::arch::x86_64::_mm_mfence();
core::arch::x86_64::_mm_lfence();
}
}
/// Emit some kind of indirect branch at virtual address `addr`.
///
    /// NOTE: We're using this to emit both the victim and attacker branches.
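    ///
    /// The emitted code is called with two arguments (System V convention):
    /// the branch target in RDI, and a second pointer in RSI which is left
    /// untouched for the branch target's use.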
fn emit_indir_brn(addr: usize, kind: IndBrn) -> X64AssemblerFixed {
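        // Round the branch address down to an 8KiB boundary to get the base
        // of the fixed mapping; `pad_until` below advances the emitter to
        // the exact requested address.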
let base = addr & 0xffff_ffff_ffff_e000;
let mut f = X64AssemblerFixed::new(
base,
0x0000_0000_0000_4000,
);
let exit = f.new_dynamic_label();
f.emit_push_nonvolatile_gprs();
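        // Prologue: save RSP to scratch memory and point R14 at the exit
        // label (both branch targets return through R14). R15 is cleared
        // here and only set for the memory-operand variants below.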
dynasm!(f
; mov rax, QWORD Self::RSP_STORAGE as i64
; mov QWORD [rax], rsp
; lea r14, [=>exit]
; xor r15, r15
);
// If we're using a memory operand, use a non-temporal store to
// write the target address into memory somewhere.
// This should expose the branch to extreme latency.
match kind {
IndBrn::CallMem | IndBrn::JmpMem => {
dynasm!(f
; mov r15, QWORD Self::TGT_STORAGE as i64
; movnti QWORD [r15], rdi
);
},
_ => {},
}
// Pad to the requested address, emit the requested indirect branch
f.emit_lfence();
f.pad_until(addr);
        match kind {
            IndBrn::JmpMem  => dynasm!(f ; jmp QWORD [r15]),
            IndBrn::CallMem => dynasm!(f ; call QWORD [r15]),
            IndBrn::JmpReg  => dynasm!(f ; jmp rdi),
            IndBrn::CallReg => dynasm!(f ; call rdi),
        }
f.emit_lfence();
// The target of our indirect branch is expected to jump back here
// using the value in r14.
dynasm!(f
; .align 64
; =>exit
; lfence
);
// Restore stack pointer and registers, return to caller
dynasm!(f
; mov rax, QWORD Self::RSP_STORAGE as i64
; mov rsp, QWORD [rax]
);
f.emit_pop_nonvolatile_gprs();
f.emit_ret();
f.emit_lfence();
f.commit().unwrap();
f
}
/// Emit the "victim" target function.
fn emit_victim_target(addr: usize) -> X64AssemblerFixed {
let mut f = X64AssemblerFixed::new(
addr,
0x0000_0000_0000_4000
);
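        // The victim's target performs no memory access; it immediately
        // returns to the emitted branch code through R14.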
f.emit_lfence();
dynasm!(f
; .align 64
; jmp r14
; lfence
);
f.commit().unwrap();
f
}
/// Emit the "attacker" target function.
fn emit_attacker_target(addr: usize) -> X64AssemblerFixed {
let mut f = X64AssemblerFixed::new(
addr,
0x0000_0000_0000_4000
);
        // Load from the address in RSI (the second argument). When the
        // victim mispredicts into this target, RSI holds the address of
        // the probe cacheline.
        dynasm!(f ; mov rax, QWORD [rsi]);
f.emit_lfence();
dynasm!(f
; .align 64
; jmp r14
; lfence
);
f.commit().unwrap();
f
}
/// Emit the code for a particular test.
///
/// FIXME: For now, the target locations are fixed.
/// Although presumably, this doesn't actually matter here?
fn emit_test(
victim_addr: usize,
attacker_addr: usize,
kind: IndBrn,
) -> BTBAliasTest {
let victim = Self::emit_indir_brn(victim_addr, kind);
let attacker = Self::emit_indir_brn(attacker_addr, kind);
let (victim_tgt, attacker_tgt) = match kind {
IndBrn::CallReg | IndBrn::CallMem => {
(
Self::emit_victim_target(0x2222_4000),
Self::emit_attacker_target(0x2222_8000),
)
},
IndBrn::JmpReg | IndBrn::JmpMem => {
(
Self::emit_victim_target(0x2223_4000),
Self::emit_attacker_target(0x2223_8000),
)
},
};
BTBAliasTest {
victim, attacker, victim_tgt, attacker_tgt
}
}
/// Run the test, then measure the probe cacheline and return the result
fn run_test(
victim_fn: MeasuredFn,
attacker_fn: MeasuredFn,
victim_tgt: usize,
attacker_tgt: usize,
) -> usize
{
        // NOTE: Try to make the state of the ITA (indirect target array)
        // consistent by taking some number of back-to-back indirect JMPs (?)
//
// Doing this seems to make the signal stronger and significantly more
// repeatable across test iterations. Note that this doesn't seem to be
// the case if we use back-to-back direct JMPs here instead.
//
// Presumably, apart from the fact that BTB entries might be aliasing
// here, the indirect predictor also has some internal state, and is
// also probably very confused about the target address. When we
// predict incorrectly with an aliasing entry, presumably the ITA is
// still being trained with the correct target (?)
//
// In any case, this seems to reliably untrain whatever effects have
// been left by the previous test iteration.
clear_ghist_indir::<8192>();
// Call the attacker, training the attacker's branch into the BTB
// (while avoiding an access to the probe cacheline)
(attacker_fn)(attacker_tgt, Self::PROBE_ADDR + 0x440);
// Explicitly flush the probe cacheline
Self::flush_probe();
// Call the victim, invoke the victim's indirect branch.
(victim_fn)(victim_tgt, Self::PROBE_ADDR);
unsafe {
// Stall for pending memory operations.
// If the load occurred, the effects should be visible afterwards.
core::arch::x86_64::_mm_mfence();
core::arch::x86_64::_mm_lfence();
            // Measure a load to the probe cacheline; a short latency means
            // the line was already resident in the L1D cache
let t0 = rdpru();
core::ptr::read_volatile::<u64>(Self::PROBE_ADDR as _);
let t1 = rdpru();
t1 - t0
}
}
/// Emit and run a test specified by [`BTBAliasArgs`].
fn run(arg: BTBAliasArgs) {
// Try to determine the noise floor for reading APERF with RDPRU.
let mut aperf_floor = usize::MAX;
for _ in 0..1024 {
let t0 = rdpru();
let t1 = rdpru();
if (t1 - t0) < aperf_floor {
aperf_floor = t1 - t0;
}
}
// If an address was not provided, assume the caller wants us to
// generate a colliding address for them
        let attacker_addr = arg.attacker_addr
            .unwrap_or_else(|| Self::generate_collision_for(arg.victim_addr));
let test = Self::emit_test(arg.victim_addr, attacker_addr, arg.kind);
let mut results = RawResults(vec![0; Self::ITERS]);
        // Map the page backing the probe cacheline (the binding itself is
        // otherwise unused)
        let _probe_base = PerfectEnv::mmap_fixed(Self::PROBE_ADDR, 0x1000);
println!("[*] Testing with {:?} ({})", arg.kind, arg.desc);
println!(" victim_brn: {:016x}", arg.victim_addr);
println!(" attacker_brn: {:016x}", attacker_addr);
println!(" victim_tgt: {:016x}", test.victim_tgt.ptr as usize);
println!(" attacker_tgt: {:016x}", test.attacker_tgt.ptr as usize);
// Run the test a couple of times
for i in 0..Self::ITERS {
let res = Self::run_test(
test.victim.as_fn(),
test.attacker.as_fn(),
test.victim_tgt.ptr as _,
test.attacker_tgt.ptr as _
);
results.0[i] = res - aperf_floor;
}
println!(" measurements:");
for line in results.0.chunks(16) {
println!(" {:?}", line);
}
println!();
}
}