Accelergy-Project
diff --git a/examples/arches/compute_in_memory/_include.yaml b/examples/arches/compute_in_memory/_include.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/examples/arches/compute_in_memory/_include_functions.py b/examples/arches/compute_in_memory/_include_functions.py
Lines changed: 20 additions & 12 deletions
diff --git a/examples/arches/compute_in_memory/_load_spec.py b/examples/arches/compute_in_memory/_load_spec.py
Lines changed: 17 additions & 2 deletions
diff --git a/examples/arches/compute_in_memory/basic_analog.yaml b/examples/arches/compute_in_memory/basic_analog.yaml
Lines changed: 175 additions & 0 deletions
@@ -53,6 +53,9 @@ variables_global: &variables_global
 
   average_input_bits_per_slice: encoded_input_bits / n_input_slices
   average_weight_bits_per_slice: encoded_weight_bits / n_weight_slices
+  average_input_bits_per_sliced_psum: encoded_input_bits / n_sliced_psums
+  average_weight_bits_per_sliced_psum: encoded_weight_bits / n_sliced_psums
+  average_output_bits_per_sliced_psum: encoded_output_bits / n_sliced_psums
 
   # This is for the bitwise-multiplication of the input and weight slices
   n_virtual_macs: max_input_bits_per_slice * max_weight_bits_per_slice * encoded_output_bits
@@ -71,4 +74,4 @@ variables_global: &variables_global
 
   n_input_slices:       max(ceil(in_b / max_input_bits_per_slice), min_input_slices)
   n_weight_slices:      max(ceil(w_b / max_weight_bits_per_slice), min_weight_slices)
-  n_sliced_psums:       n_input_slices * n_weight_slices
+  n_sliced_psums:       n_input_slices * n_weight_slices
@@ -2,23 +2,25 @@
 
 
 def get_array_fanout_reuse_input(spec: af.Spec) -> int:
-    n_rows = 1
+    """Get total fanout of array spatial dims that reuse input (= columns)."""
+    n = 1
     for leaf in spec.arch.get_nodes_of_type(af.arch.Leaf):
-        if "array_reuse_input" in leaf.spatial:
-            fanout = leaf.spatial["array_reuse_input"]["fanout"]
-            assert isinstance(fanout, (int, float)), f"fanout {leaf.name}.spatial.array_reuse_input.fanout is not a number"
-            n_rows *= fanout
-    return n_rows
+        for sp in leaf.spatial:
+            if sp.name.endswith("ARRAY_COLUMNS") or sp.name.endswith("ARRAY_ROWS"):
+                if str(sp.may_reuse) == "input" or str(sp.reuse) == "input":
+                    n *= sp.fanout
+    return n
 
 
 def get_array_fanout_reuse_output(spec: af.Spec) -> int:
-    n_cols = 1
+    """Get total fanout of array spatial dims that reuse output (= rows)."""
+    n = 1
     for leaf in spec.arch.get_nodes_of_type(af.arch.Leaf):
-        if "array_reuse_output" in leaf.spatial:
-            fanout = leaf.spatial["array_reuse_output"]["fanout"]
-            assert isinstance(fanout, (int, float)), f"fanout {leaf.name}.spatial.array_reuse_output.fanout is not a number"
-            n_cols *= fanout
-    return n_cols
+        for sp in leaf.spatial:
+            if sp.name.endswith("ARRAY_COLUMNS") or sp.name.endswith("ARRAY_ROWS"):
+                if str(sp.may_reuse) == "output" or str(sp.reuse) == "output":
+                    n *= sp.fanout
+    return n
 
 
 def get_array_fanout_total(spec: af.Spec) -> int:
@@ -33,10 +35,12 @@ def get_array_fanout_total(spec: af.Spec) -> int:
 from math import log2
 from typing import List, NamedTuple, Union
 
+
 class ProbableBits(NamedTuple):
     bits: list
     probability: float
 
+
 # ==============================================================================
 # Encoding functions
 # ==============================================================================
@@ -55,6 +59,7 @@ def magnitude_encode_hist(weights) -> List[ProbableBits]:
         encoded.append(ProbableBits(to_bits_unsigned(abs(normed), nbits)[1:], w))
     return norm_encoded_hist(encoded)
 
+
 def two_part_magnitude_encode_hist(weights):
     """
     Two (devices, timesteps, components, etc.) encode each signed value. If the
@@ -69,6 +74,7 @@ def two_part_magnitude_encode_hist(weights):
         m2.append(ProbableBits([0] * len(e.bits), e.probability / 2))
     return m2
 
+
 def offset_encode_hist(weights):
     """
     A signed value is encoded as the value minus the negative minimum value.
@@ -132,10 +138,12 @@ def zero_gated_xnor_encode_hist(weights):
     )
     return encoded
 
+
 # ==============================================================================
 # Helper functions
 # ==============================================================================
 
+
 def assert_hist_pow2_minus1(hist):
     x = 1
     while x <= len(hist):
 
@@ -9,6 +9,7 @@ def get_spec(
     arch_name: str,
     compare_with_arch_name: str | None = None,
     add_dummy_main_memory: bool = False,
+    n_macros: int = 1,
 ) -> af.Spec:
     """
     Gets the spec for the given architecture. If `compare_with_arch_name` is given, the
@@ -22,7 +23,8 @@ def get_spec(
     compare_with_arch_name: str | None
         The name of the architecture to compare with. If not given, variables will be
         taken from the given `arch_name`.
-
+    n_macros: int
+        The number of macros to use in the architecture.
     Returns
     -------
     spec: af.Spec
@@ -33,6 +35,7 @@ def get_spec(
     else:
         compare_with_name = compare_with_arch_name
 
+    arch_name_base = arch_name
     arch_name = os.path.join(THIS_SCRIPT_DIR, f"{arch_name}.yaml")
     compare_with_name = os.path.join(THIS_SCRIPT_DIR, f"{compare_with_name}.yaml")
     variables = af.Variables.from_yaml(arch_name, top_key="variables")
@@ -43,15 +46,27 @@ def get_spec(
     spec.config.expression_custom_functions.append(
         os.path.join(THIS_SCRIPT_DIR, "_include_functions.py")
     )
+    # Load architecture-specific helper functions if they exist
+    arch_helpers = os.path.join(
+        THIS_SCRIPT_DIR, f"{arch_name_base}_helper_functions.py"
+    )
+    if os.path.exists(arch_helpers):
+        spec.config.expression_custom_functions.append(arch_helpers)
     spec.config.component_models.append(
         os.path.join(THIS_SCRIPT_DIR, "components/*.py")
     )
+    if n_macros > 1:
+        macro = af.arch.Container(
+            name="MacroAuto",
+            spatial=[{"name": "macro", "fanout": n_macros, "power_gateable": True}],
+        )
+        spec.arch.nodes.insert(0, macro)
     if add_dummy_main_memory:
         main_memory = af.arch.Memory(
             name="MainMemory",
             component_class="Dummy",
             size=float("inf"),
-            tensors={"keep": "~weight"}
+            tensors={"keep": "~weight"},
         )
         spec.arch.nodes.insert(0, main_memory)
     return spec
@@ -0,0 +1,175 @@
+# Basic analog CiM macro.
+# A simple analog compute-in-memory macro demonstrating the fundamental
+# components of an analog CiM array: row drivers (DAC), column drivers,
+# ADC, memory cells, and a virtualized MAC compute unit.
+
+{{include_text('_include.yaml')}}
+{{add_to_path('./memory_cells')}}
+
+arch:
+  variables:
+    <<: *variables_global
+
+    # ===========================================================================
+    # Encoding-dependent parameters
+    # ===========================================================================
+    encoded_input_bits:  input_bits
+    encoded_weight_bits: weight_bits
+    encoded_output_bits: output_bits
+
+    input_encoding_func: offset_encode_hist
+    weight_encoding_func: offset_encode_hist
+
+    # For the accuracy model: can in-array accumulation include signed values?
+    # Signed accumulation is not compatible with offset encoding (since offset
+    # encoding makes values non-negative).
+    signed_sum_across_inputs: False
+    signed_sum_across_weights: False
+
+    # ===========================================================================
+    # Architecture & CiM Array Structure
+    # ===========================================================================
+    cim_unit_width_cells:  1
+    cim_unit_depth_cells:  1
+    bits_per_cell:         8
+
+    # ===========================================================================
+    # Data Converters
+    # ===========================================================================
+    adc_resolution: 8
+    voltage_dac_resolution: 1
+    temporal_dac_resolution: 8
+
+    n_adc_per_bank: 2
+
+    # ===========================================================================
+    # Hardware
+    # ===========================================================================
+    cycle_period: 1e-7 * voltage_latency_scale
+    read_pulse_width: 1e-9
+
+  extra_attributes_for_all_component_models:
+    <<: *cim_component_attributes
+    tech_node: tech_node
+    cycle_period: cycle_period
+
+  nodes:
+  - !Toll # ADC: Column readout
+    name: ADC
+    tensors: {keep: output}
+    direction: up
+    bits_per_action: average_output_bits_per_sliced_psum
+    component_class: ADC
+    energy_scale: adc_energy_scale
+    area_scale: adc_area_scale
+    extra_attributes_for_component_model:
+      n_bits: adc_resolution
+      throughput_scale: 1
+      throughput: 1 / cycle_period * cols_active_at_once * throughput_scale
+
+  - !Toll # Column drivers precharge the array columns
+    name: ColumnDrivers
+    tensors: {keep: output}
+    direction: up
+    bits_per_action: average_output_bits_per_sliced_psum
+    component_class: ArrayColumnDrivers
+
+  - !Toll # Row drivers feed inputs onto the rows of the array
+    name: RowDrivers
+    tensors: {keep: input}
+    direction: down
+    bits_per_action: average_input_bits_per_slice
+    component_class: ArrayRowDrivers
+    extra_attributes_for_component_model:
+      temporal_spiking: true
+
+  # This memory catches sliding windows that may be sent spatially in the array. E.g.,
+  # convolution steps spatially unrolled onto columns with overlapping windows. Size =
+  # one input value per row. no_resend_to_below prevents reuse across temporal
+  # iterations.
+  - !Memory
+    name: DummyRowDriverMemory
+    component_class: Dummy
+    size: input.bits_per_value * array_parallel_inputs
+    tensors: {keep: input, no_resend_to_below: input}
+
+  - !Container # Each column stores a different weight slice. Columns share inputs.
+    name: Column
+    spatial:
+    - name: column_ARRAY_COLUMNS
+      fanout: 32
+      may_reuse: input
+      min_usage: 1
+      usage_scale: n_weight_slices
+
+  - !Container # Each row receives a different input slice. Rows share outputs.
+    name: Row
+    spatial:
+    - name: row_ARRAY_ROWS
+      fanout: 32
+      may_reuse: output
+      reuse: output
+      min_usage: 1
+
+  # CiM unit stores weights and computes MACs.
+  - !Memory
+    name: CimUnit
+    tensors: {keep: weight, no_refetch_from_above: weight, force_memory_hierarchy_order: False}
+    size: cim_unit_width_cells * cim_unit_depth_cells * bits_per_cell * n_weight_slices
+    bits_per_action: average_weight_bits_per_sliced_psum
+    n_parallel_instances: n_weight_slices
+    component_class: MemoryCell
+    actions: [{name: read, latency: cycle_period}]
+    extra_attributes_for_component_model:
+      n_instances: cim_unit_width_cells * cim_unit_depth_cells
+
+  # We account for compute energy in the CimUnit reads
+  - !Compute
+    name: FreeCompute
+    component_class: Dummy
+    enabled: len(All) == 3
+
+
+# These variables pertain to the workload, microarch, and circuits.
+variables:
+  inputs_hist: [0, 0, 0, 3, 2, 1, 0]
+  weights_hist: ([1] * 15)
+  outputs_hist: inputs_hist
+
+  ## Microarch ----------------------------------------------------------------
+  supported_input_bits:  8
+  supported_weight_bits: 8
+  supported_output_bits: 8
+  min_supported_input_bits: 1
+  min_supported_weight_bits: 1
+  min_supported_output_bits: 1
+
+  # Circuits ------------------------------------------------------------------
+  voltage: 1
+  tech_node: 65e-9 # 65nm
+  cell_config: "{{find_path('rram_example.yaml')}}"
+  voltage_energy_scale: voltage ** 2
+  voltage_latency_scale: 1 / voltage
+
+  # Calibration ---------------------------------------------------------------
+  adc_energy_scale: voltage_energy_scale
+  adc_area_scale: 1
+  row_col_drivers_area_scale: 1
+
+
+# This workload is sized to get peak throughput & energy efficiency.
+# 32 columns × 32 rows fills the array.
+workload:
+  rank_sizes:
+    M: 1
+    N: 32
+    K: 32
+
+  einsums:
+  - name: Matmul
+    tensor_accesses:
+    - {name: input, projection: [m, k], bits_per_value: 8}
+    - {name: weight, projection: [k, n], bits_per_value: 8}
+    - {name: output, projection: [m, n], output: True, bits_per_value: 8}
+
+renames: {} # Not needed for this workload