kvcache-view/test_memory.html at main · LMCache/kvcache-view · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
<!doctype html>
<html>
    <head>
        <title>Memory Test</title>
    </head>
    <body>
        <script>
            // Test Llama 3.2 1B memory calculation.
            //
            // Back-of-the-envelope estimate of training memory under mixed precision,
            // logged to the console. Units are kept explicit:
            //   - memory sizes are in GiB (2^30 bytes)
            //   - transfer rates are in decimal GB/s (10^9 bytes per second)
            // The two must not be mixed: 1 GiB is about 1.074 GB.
            const BYTES_PER_GIB = 1024 ** 3
            const BYTES_PER_GB = 1e9

            // Capacity used for the "On W7900 48GB" percentage.
            // NOTE(review): treats the advertised 48GB as 48 GiB — confirm.
            const GPU_MEMORY_GIB = 48

            // Link rate the utilization is measured against.
            // NOTE(review): 16 GB/s is the rounded figure this script uses for
            // PCIe 3.0 (presumably an x16 link) — confirm.
            const PCIE3_GB_PER_SEC = 16

            // Parameter count, in billions.
            const model = { params: 1.2 }

            // FP16 weights (mixed precision enabled by default): 2 bytes per parameter
            const fp16Weights = (model.params * 1e9 * 2) / BYTES_PER_GIB
            console.log('FP16 Weights:', fp16Weights, 'GiB')

            // FP32 master weights for mixed precision: 4 bytes per parameter
            const fp32MasterWeights = (model.params * 1e9 * 4) / BYTES_PER_GIB
            console.log('FP32 Master Weights:', fp32MasterWeights, 'GiB')

            // Gradients (FP16): same size as the FP16 weights
            const gradients = fp16Weights
            console.log('Gradients:', gradients, 'GiB')

            // AdamW optimizer (2x for momentum + variance), sized like the FP32 weights
            const optimizer = fp32MasterWeights * 2
            console.log('Optimizer:', optimizer, 'GiB')

            // Total with mixed precision
            const totalMixedPrecision = fp16Weights + fp32MasterWeights + gradients + optimizer
            console.log('Total (Mixed Precision):', totalMixedPrecision, 'GiB')

            // Activations (rough estimate for batch=4, seq=1024)
            const activations = 2 // Conservative estimate, in GiB
            console.log('Activations:', activations, 'GiB')

            console.log('GRAND TOTAL:', totalMixedPrecision + activations, 'GiB')
            console.log(
                'On W7900 48GB:',
                (((totalMixedPrecision + activations) / GPU_MEMORY_GIB) * 100).toFixed(1) + '%'
            )

            // PCIe bandwidth for this amount of data
            const dataPerStep = totalMixedPrecision * 0.5 // Moving half the data per step (GiB)
            const stepsPerSecond = 4
            // Convert GiB/s to decimal GB/s so the value matches its 'GB/s' label and
            // can be compared directly with the PCIe rate below.
            const bandwidthNeeded = (dataPerStep * stepsPerSecond * BYTES_PER_GIB) / BYTES_PER_GB
            console.log('Bandwidth needed:', bandwidthNeeded, 'GB/s')
            console.log('PCIe 3.0 utilization:', ((bandwidthNeeded / PCIE3_GB_PER_SEC) * 100).toFixed(1) + '%')
        </script>
    </body>
</html>