|
| 1 | +# Basic analog CiM macro. |
| 2 | +# A simple analog compute-in-memory macro demonstrating the fundamental |
| 3 | +# components of an analog CiM array: row drivers (DAC), column drivers, |
| 4 | +# ADC, memory cells, and a virtualized MAC compute unit. |
| 5 | + |
| 6 | +{{include_text('_include.yaml')}} |
| 7 | +{{add_to_path('./memory_cells')}} |
| 8 | + |
| 9 | +arch: |
| 10 | + variables: |
| 11 | + <<: *variables_global |
| 12 | + |
| 13 | + # =========================================================================== |
| 14 | + # Encoding-dependent parameters |
| 15 | + # =========================================================================== |
| 16 | + encoded_input_bits: input_bits |
| 17 | + encoded_weight_bits: weight_bits |
| 18 | + encoded_output_bits: output_bits |
| 19 | + |
| 20 | + input_encoding_func: offset_encode_hist |
| 21 | + weight_encoding_func: offset_encode_hist |
| 22 | + |
| 23 | + # Used by the accuracy model: can in-array accumulation include signed values? |
| 24 | + # Signed accumulation is not compatible with offset encoding (since offset |
| 25 | + # encoding makes values non-negative). |
| 26 | + signed_sum_across_inputs: False |
| 27 | + signed_sum_across_weights: False |
| 28 | + |
| 29 | + # =========================================================================== |
| 30 | + # Architecture & CiM Array Structure |
| 31 | + # =========================================================================== |
| 32 | + cim_unit_width_cells: 1 |
| 33 | + cim_unit_depth_cells: 1 |
| 34 | + bits_per_cell: 8 |
| 35 | + |
| 36 | + # =========================================================================== |
| 37 | + # Data Converters |
| 38 | + # =========================================================================== |
| 39 | + adc_resolution: 8 |
| 40 | + voltage_dac_resolution: 1 |
| 41 | + temporal_dac_resolution: 8 |
| 42 | + |
| 43 | + n_adc_per_bank: 2 |
| 44 | + |
| 45 | + # =========================================================================== |
| 46 | + # Hardware |
| 47 | + # =========================================================================== |
| 48 | + cycle_period: 1e-7 * voltage_latency_scale |
| 49 | + read_pulse_width: 1e-9 |
| 50 | + |
| 51 | + extra_attributes_for_all_component_models: |
| 52 | + <<: *cim_component_attributes |
| 53 | + tech_node: tech_node |
| 54 | + cycle_period: cycle_period |
| 55 | + |
| 56 | + nodes: |
| 57 | + - !Toll # ADC: Column readout |
| 58 | + name: ADC |
| 59 | + tensors: {keep: output} |
| 60 | + direction: up |
| 61 | + bits_per_action: average_output_bits_per_sliced_psum |
| 62 | + component_class: ADC |
| 63 | + energy_scale: adc_energy_scale |
| 64 | + area_scale: adc_area_scale |
| 65 | + extra_attributes_for_component_model: |
| 66 | + n_bits: adc_resolution |
| 67 | + throughput_scale: 1 |
| 68 | + throughput: 1 / cycle_period * cols_active_at_once * throughput_scale |
| 69 | + |
| 70 | + - !Toll # Column drivers precharge the array columns |
| 71 | + name: ColumnDrivers |
| 72 | + tensors: {keep: output} |
| 73 | + direction: up |
| 74 | + bits_per_action: average_output_bits_per_sliced_psum |
| 75 | + component_class: ArrayColumnDrivers |
| 76 | + |
| 77 | + - !Toll # Row drivers feed inputs onto the rows of the array |
| 78 | + name: RowDrivers |
| 79 | + tensors: {keep: input} |
| 80 | + direction: down |
| 81 | + bits_per_action: average_input_bits_per_slice |
| 82 | + component_class: ArrayRowDrivers |
| 83 | + extra_attributes_for_component_model: |
| 84 | + temporal_spiking: true |
| 85 | + |
| 86 | + # This memory captures sliding windows that may be sent spatially across the array, |
| 87 | + # e.g., convolution steps spatially unrolled onto columns with overlapping windows. |
| 88 | + # It is sized to hold one input value per row. no_resend_to_below prevents reuse |
| 89 | + # across temporal iterations. |
| 90 | + - !Memory |
| 91 | + name: DummyRowDriverMemory |
| 92 | + component_class: Dummy |
| 93 | + size: input.bits_per_value * array_parallel_inputs |
| 94 | + tensors: {keep: input, no_resend_to_below: input} |
| 95 | + |
| 96 | + - !Container # Each column stores a different weight slice. Columns share inputs. |
| 97 | + name: Column |
| 98 | + spatial: |
| 99 | + - name: column_ARRAY_COLUMNS |
| 100 | + fanout: 32 |
| 101 | + may_reuse: input |
| 102 | + min_usage: 1 |
| 103 | + usage_scale: n_weight_slices |
| 104 | + |
| 105 | + - !Container # Each row receives a different input slice. Rows share outputs. |
| 106 | + name: Row |
| 107 | + spatial: |
| 108 | + - name: row_ARRAY_ROWS |
| 109 | + fanout: 32 |
| 110 | + may_reuse: output |
| 111 | + reuse: output |
| 112 | + min_usage: 1 |
| 113 | + |
| 114 | + # CiM unit stores weights and computes MACs. |
| 115 | + - !Memory |
| 116 | + name: CimUnit |
| 117 | + tensors: {keep: weight, no_refetch_from_above: weight, force_memory_hierarchy_order: False} |
| 118 | + size: cim_unit_width_cells * cim_unit_depth_cells * bits_per_cell * n_weight_slices |
| 119 | + bits_per_action: average_weight_bits_per_sliced_psum |
| 120 | + n_parallel_instances: n_weight_slices |
| 121 | + component_class: MemoryCell |
| 122 | + actions: [{name: read, latency: cycle_period}] |
| 123 | + extra_attributes_for_component_model: |
| 124 | + n_instances: cim_unit_width_cells * cim_unit_depth_cells |
| 125 | + |
| 126 | + # Compute energy is accounted for in the CimUnit reads, so compute uses a Dummy component. |
| 127 | + - !Compute |
| 128 | + name: FreeCompute |
| 129 | + component_class: Dummy |
| 130 | + enabled: len(All) == 3 |
| 131 | + |
| 132 | + |
| 133 | +# These variables pertain to the workload, microarchitecture, circuits, and calibration. |
| 134 | +variables: |
| 135 | + inputs_hist: [0, 0, 0, 3, 2, 1, 0] |
| 136 | + weights_hist: ([1] * 15) |
| 137 | + outputs_hist: inputs_hist |
| 138 | + |
| 139 | + # Microarch ----------------------------------------------------------------- |
| 140 | + supported_input_bits: 8 |
| 141 | + supported_weight_bits: 8 |
| 142 | + supported_output_bits: 8 |
| 143 | + min_supported_input_bits: 1 |
| 144 | + min_supported_weight_bits: 1 |
| 145 | + min_supported_output_bits: 1 |
| 146 | + |
| 147 | + # Circuits ------------------------------------------------------------------ |
| 148 | + voltage: 1 |
| 149 | + tech_node: 65e-9 # 65nm |
| 150 | + cell_config: "{{find_path('rram_example.yaml')}}" |
| 151 | + voltage_energy_scale: voltage ** 2 |
| 152 | + voltage_latency_scale: 1 / voltage |
| 153 | + |
| 154 | + # Calibration --------------------------------------------------------------- |
| 155 | + adc_energy_scale: voltage_energy_scale |
| 156 | + adc_area_scale: 1 |
| 157 | + row_col_drivers_area_scale: 1 |
| 158 | + |
| 159 | + |
| 160 | +# This workload is sized to get peak throughput & energy efficiency. |
| 161 | +# 32 columns × 32 rows fills the array. |
| 162 | +workload: |
| 163 | + rank_sizes: |
| 164 | + M: 1 |
| 165 | + N: 32 |
| 166 | + K: 32 |
| 167 | + |
| 168 | + einsums: |
| 169 | + - name: Matmul |
| 170 | + tensor_accesses: |
| 171 | + - {name: input, projection: [m, k], bits_per_value: 8} |
| 172 | + - {name: weight, projection: [k, n], bits_per_value: 8} |
| 173 | + - {name: output, projection: [m, n], output: True, bits_per_value: 8} |
| 174 | + |
| 175 | +renames: {} # Not needed for this workload |
0 commit comments