- **train.py**: This file contains the model definition, data loading, training loop, and other essential components for training a GNN.
- **run_4.sh**: This is an example shell script for Perlmutter, demonstrating how to run a Plexus-parallelized GNN on 4 GPUs. It includes placeholders that should be replaced with appropriate values for specific experiments, such as the dataset path, output directory, etc. The script can be adapted to run on different numbers of GPUs and with different datasets.
For example, the script can be launched using:
```bash
sbatch run_4.sh 1 1 4 0
```
This would execute the training with a 3D parallelism configuration of (X, Y, Z) = (1, 1, 4) for trial number 0. The trial number is often used to differentiate output files from multiple runs.
- **get_rank.sh**: This shell script is used to set the ranks for the GPUs involved in the distributed training process. It also limits the core dump file size to 0.
- **parse_results.py**: This Python script contains the `process_log_file` function, which can be used to parse the timing results from the output log file generated by a training run.
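As a rough illustration, a parser of this kind might look like the following. Note that the log-line format and the return value here are assumptions for the sake of example, not the actual implementation in `parse_results.py`:

```python
import re

def process_log_file(path):
    """Collect per-epoch timings from an output log.

    Assumes (hypothetically) lines of the form 'epoch 3 time: 12.34';
    the real log format used by Plexus may differ.
    """
    times = []
    pattern = re.compile(r"epoch\s+\d+\s+time:\s+([0-9.]+)")
    with open(path) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                times.append(float(match.group(1)))
    return times
```

Collecting the timings into a list makes it easy to drop warm-up epochs and average the rest when comparing runs.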
- `export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"`: This can be set if there are warnings about memory fragmentation, which can cause GPU OOM issues.
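The same allocator setting can also be applied from Python instead of the shell; a small sketch mirroring the `export` above:

```python
import os

# Must be set before `import torch`, because the CUDA caching allocator
# reads PYTORCH_CUDA_ALLOC_CONF when it initializes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
```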
- `NNZ`: number of nonzeros in the graph's adjacency matrix
- `D_list`: list of feature sizes at each layer, excluding the number of classes (e.g., 3 GCN layers with a hidden dimension of 128 and an input feature size of 100 gives [100, 128, 128])
- `coef`: coefficients that the three terms of the model are multiplied by to get times in ms (the default coefficients don't produce meaningful times, but they do give an ordering of the configs)
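To make the role of `coef` concrete, here is a hypothetical sketch of how a three-term cost model could be evaluated. The three terms below are placeholders (the model's actual terms are not described here); only the weighted-sum structure and the use of `NNZ`, `D_list`, and `coef` come from the description above:

```python
def model_time_ms(nnz, d_list, coef):
    """Weighted sum of three cost terms.

    The terms are hypothetical stand-ins, not the model's actual terms;
    they only illustrate how coef weights each term.
    """
    term_compute = sum(nnz * d for d in d_list)  # e.g., SpMM-like work per layer
    term_volume = sum(d_list)                    # e.g., a communication-volume proxy
    term_latency = len(d_list)                   # e.g., a per-layer latency count
    return coef[0] * term_compute + coef[1] * term_volume + coef[2] * term_latency
```

Because only the relative ordering matters with the default coefficients, the estimates are best used to rank candidate configurations rather than to predict absolute runtimes.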