-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_data_files.py
More file actions
277 lines (225 loc) · 11 KB
/
generate_data_files.py
File metadata and controls
277 lines (225 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import argparse
import glob
import os
import re
import sys
import shutil
import tempfile
import subprocess
import yaml
from collections import defaultdict, OrderedDict
from datetime import datetime, timezone
from easybuild.tools.version import VERSION as EASYBUILD_VERSION
from easybuild.framework.easyconfig.easyconfig import process_easyconfig, get_toolchain_hierarchy
from easybuild.tools.options import set_up_configuration
from easybuild.tools.include import include_easyblocks
from contextlib import contextmanager
# EESSI releases this script knows how to scan (offered as the choices of --eessi-version).
VALID_EESSI_VERSIONS = ["2025.06", "2023.06"]

# CPU target whose software tree is scanned; it is substituted into the CVMFS base path.
EESSI_REFERENCE_ARCHITECTURE = "x86_64/intel/icelake"

# Top-level toolchains per EESSI release. The order of each list is meaningful:
# it is what lets consumers work out which toolchain counts as the "latest".
EESSI_SUPPORTED_TOP_LEVEL_TOOLCHAINS = OrderedDict(
    [
        (
            "2025.06",
            [{"name": "foss", "version": toolchain_version} for toolchain_version in ("2025b", "2025a", "2024a")],
        ),
        (
            "2023.06",
            [{"name": "foss", "version": toolchain_version} for toolchain_version in ("2023b", "2023a", "2022b")],
        ),
    ]
)
@contextmanager
def suppress_stdout():
    """
    Context manager that sends everything written to ``sys.stdout`` to ``os.devnull``.

    On exit (normal or via an exception) ``sys.stdout`` is rebound to the object it
    referred to on entry. Only the devnull handle opened here is closed, so a stream
    that the wrapped code may have assigned to ``sys.stdout`` is never closed by mistake.
    """
    saved_stdout = sys.stdout
    # The `with` guarantees the devnull handle itself is closed, regardless of what
    # `sys.stdout` points to by the time the body finishes.
    with open(os.devnull, "w") as devnull:
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = saved_stdout
def module_dict_from_module_string(module):
    """
    Break a ``<name>/<version>`` module string into its components.

    Only the first ``/`` separates name from version; any further slashes stay in the version.
    A string without ``/`` raises ``ValueError`` (from the tuple unpacking).

    Returns:
        dict: keys ``module_name``, ``module_version`` and ``full_module_name`` (the input string).
    """
    name, version = module.split("/", maxsplit=1)
    return {
        "module_name": name,
        "module_version": version,
        "full_module_name": module,
    }
def load_and_list_modules(full_module_name):
    """
    Run `module load <name>` and `module --terse list` inside a single bash subshell.

    The commands run in a child process, so the environment of this Python process is not modified.

    Parameters:
        full_module_name (str): Module to load, in ``<name>/<version>`` form.

    Returns:
        list: One dict (see `module_dict_from_module_string`) per line of the listing that contains
        a ``/``. ``EESSI-extend`` entries are removed unless the requested module is itself
        ``EESSI-extend``; ``EasyBuild`` entries are additionally removed unless the requested
        module is ``EasyBuild`` or ``EESSI-extend``.

    Raises:
        RuntimeError: If the subshell exits with a non-zero status (e.g. the module fails to load).
    """
    # Both commands go in one script so they share the same shell session. The module name is
    # passed as positional parameter $1 (the "bash" before it becomes $0) rather than being
    # interpolated into the script text, so shell metacharacters in the name cannot alter the command.
    script = 'module load "$1" || exit 1\nmodule --terse list 2>&1\n'
    result = subprocess.run(
        ["bash", "-c", script, "bash", full_module_name],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(f"Failed to load module '{full_module_name}':\n{result.stdout}")

    # Parse module list output; only lines of the form name/version are module entries
    modules = [module_dict_from_module_string(line) for line in result.stdout.splitlines() if "/" in line]

    # Decide which module names to drop from the listing, based on the module that was requested
    eessi_extend_module_name = "EESSI-extend"
    eb_module_name = "EasyBuild"
    if full_module_name.startswith(f"{eessi_extend_module_name}/"):
        # Don't filter anything
        filtered_names = set()
    elif full_module_name.startswith(f"{eb_module_name}/"):
        # Filter EESSI-extend
        filtered_names = {eessi_extend_module_name}
    else:
        # Filter EESSI-extend and EasyBuild
        filtered_names = {eessi_extend_module_name, eb_module_name}

    return [module for module in modules if module["module_name"] not in filtered_names]
def use_timestamped_reprod_if_exists(original_path):
    """
    Map a path inside a 'software' tree to its counterpart in the latest timestamped 'reprod' tree.

    The last 'software' component of the path is replaced by 'reprod', the two components that follow
    it are kept, and the lexicographically greatest sub-directory found there is inserted before the
    remainder of the path.

    Parameters:
        original_path (str): Path containing a 'software' component.

    Returns:
        str: The rewritten path if it exists on disk. Otherwise `original_path` unchanged; this
        includes the case where the path has no 'software' component at all.
    """
    # Split path
    parts = original_path.strip(os.sep).split(os.sep)

    # Nothing to rewrite: fall back to the original path instead of failing on the lookup below
    if "software" not in parts:
        return original_path

    # Find the last occurrence of 'software' and replace it by 'reprod'
    idx = len(parts) - 1 - parts[::-1].index("software")
    parts[idx] = "reprod"

    # strip() above removed any leading separator; restore it only for absolute inputs so a
    # relative input is not silently turned into an absolute path
    prefix = os.sep if os.path.isabs(original_path) else ""
    # Path up to and including the two components after 'reprod'
    # (presumably <software name>/<version> -- confirm against the directory layout)
    pre_timestamp = prefix + os.sep.join(parts[: idx + 3])
    # Remaining components, re-appended after the timestamp directory
    post_version = parts[idx + 3 :]

    # Look for timestamp directories under pre_timestamp
    timestamp_dirs = [d for d in glob.glob(os.path.join(pre_timestamp, "*")) if os.path.isdir(d)]
    if not timestamp_dirs:
        return original_path

    # glob already returns each candidate prefixed with pre_timestamp, so join directly onto it.
    # The candidates share that prefix, so max() effectively compares the directory names.
    latest_timestamp = max(timestamp_dirs)  # lexicographic order
    final_path = os.path.join(latest_timestamp, *post_version)
    return final_path if os.path.exists(final_path) else original_path
def collect_eb_files(base_path):
    """
    Group installed easyconfig (.eb) files by the major EasyBuild version that installed them.

    Files are located with the fixed-depth pattern ``<base_path>/*/*/easybuild/*.eb``. For each one,
    the most recently modified ``*-easybuild-devel`` file in the same folder is searched for a
    ``software/EasyBuild/<major>.<minor>.<patch>/bin`` path, and ``<major>`` becomes the group key.

    Folders whose path contains 'EasyBuild' or 'EESSI-extend' fall back to the major version of the
    currently loaded EasyBuild when no version is found. 'EESSI-extend' entries resolving to major
    version "4" are skipped.

    Parameters:
        base_path (str): Root folder to scan for .eb files.

    Returns:
        dict: {major_version (str): [list of .eb file paths]}

    Raises:
        FileNotFoundError: If a folder holding an .eb file has no ``*-easybuild-devel`` file.
        ValueError: If no EasyBuild version can be extracted for a folder that is not one of the
            'EasyBuild' / 'EESSI-extend' exceptions.
    """
    devel_version_re = re.compile(r"software/EasyBuild/(\d+)\.(\d+)\.(\d+)/bin")
    # Major version of the running EasyBuild, used as fallback for the exception folders
    fallback_major = str(EASYBUILD_VERSION.version[0])

    grouped = defaultdict(list)
    for eb_path in glob.glob(os.path.join(base_path, "*/*/easybuild/*.eb")):
        install_dir = os.path.dirname(eb_path)

        # The devel file sits next to the easyconfig
        candidates = glob.glob(os.path.join(install_dir, "*-easybuild-devel"))
        if not candidates:
            raise FileNotFoundError(f"No *-easybuild-devel file found in folder: {install_dir}")

        # With several devel files, trust the most recently modified one
        newest_devel = max(candidates, key=os.path.getmtime)
        with open(newest_devel, "r") as handle:
            found = devel_version_re.search(handle.read())

        is_eessi_extend = "EESSI-extend" in install_dir
        if is_eessi_extend or "EasyBuild" in install_dir:
            major = found.group(1) if found else fallback_major
            # Don't add EESSI-extend to EB4 or the same file will appear twice
            if is_eessi_extend and major == "4":
                continue
        elif found:
            major = found.group(1)
        else:
            raise ValueError(f"Cannot extract EasyBuild version from file: {newest_devel}")

        grouped[major].append(eb_path)

    return dict(grouped)
if __name__ == "__main__":
    # Script entry point: scan the installations of one EESSI version on the reference architecture,
    # parse the easyconfigs installed with the running EasyBuild major version, and write everything
    # to eessi_software_<eessi_version>-eb<major>.yaml in the current directory.

    # The EESSI version is provided as an argument
    parser = argparse.ArgumentParser(description="EESSI version to scan.")
    parser.add_argument(
        "--eessi-version",
        "-e",
        required=True,
        choices=VALID_EESSI_VERSIONS,
        help=f"Allowed versions: {', '.join(VALID_EESSI_VERSIONS)}",
    )
    args = parser.parse_args()
    eessi_version = args.eessi_version
    print(f"Using EESSI version: {eessi_version}")

    # We use a single architecture path to gather information about the software versions
    base_path = (
        f"/cvmfs/software.eessi.io/versions/{eessi_version}/software/linux/{EESSI_REFERENCE_ARCHITECTURE}/software/"
    )
    result = collect_eb_files(base_path)

    set_up_configuration(args="")

    # Major version of the EasyBuild running this script; only installations made with the same
    # major version are parsed
    current_eb_major = str(EASYBUILD_VERSION.version[0])

    # Store all our data in a dict; version_data is the sub-dict for the EESSI version being scanned
    version_data = {}
    eessi_software = {"eessi_version": {eessi_version: version_data}}

    # Add a timestamp
    eessi_software["timestamp"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Store the toolchain hierarchies supported by the EESSI version
    version_data["toolchain_hierarchy"] = {}
    for top_level_toolchain in EESSI_SUPPORTED_TOP_LEVEL_TOOLCHAINS[eessi_version]:
        # versions are typically 2024a/2024b etc. for top level toolchains
        # so let's use that to make sorting easy
        toolchain_family = f"{top_level_toolchain['version']}_{top_level_toolchain['name']}"
        # Get the hierarchy and always add the system toolchain
        version_data["toolchain_hierarchy"][toolchain_family] = [
            {"name": "system", "version": "system"}
        ] + get_toolchain_hierarchy(top_level_toolchain)

    # Scratch space handed to include_easyblocks; the context manager removes it again when the
    # scan finishes or fails, so no temporary directory is left behind
    with tempfile.TemporaryDirectory() as tmpdir:
        for eb_version_of_install, files in sorted(result.items()):
            print(f"Major version {eb_version_of_install}:")
            if eb_version_of_install != current_eb_major:
                continue

            total_files = len(files)
            for i, file in enumerate(files, start=1):
                percent = (i / total_files) * 100
                print(f"{percent:.1f}% - {file}")

                # Don't try to parse an EasyBuild easyconfig that is not the same major release
                if "/software/EasyBuild/" in file and f"/EasyBuild/{eb_version_of_install}" not in file:
                    continue

                # Include the easyblocks stored alongside the installation (preferring the latest
                # timestamped reprod directory) before parsing the easyconfig
                eb_hooks_path = use_timestamped_reprod_if_exists(f"{os.path.dirname(file)}/reprod/easyblocks")
                easyblocks_dir = include_easyblocks(tmpdir, [eb_hooks_path + "/*.py"])
                with suppress_stdout():
                    parsed_ec = process_easyconfig(file)[0]

                # included easyblocks are the first entry in sys.path, so just pop them but keep a list of what was used
                sys.path.pop(0)
                easyblocks_used = [
                    os.path.basename(f)
                    for f in glob.glob(f"{easyblocks_dir}/**/*.py", recursive=True)
                    if os.path.basename(f) != "__init__.py"
                ]
                shutil.rmtree(easyblocks_dir)

                # Collect everything we now know about the installation as a dict
                record = parsed_ec["ec"].asdict()
                record["mtime"] = os.path.getmtime(file)

                # Make sure we can load the module before adding its information to the main dict
                try:
                    record["required_modules"] = load_and_list_modules(parsed_ec["full_mod_name"])
                except RuntimeError as e:
                    print(f"Ignoring {file} due to error processing module: {e}")
                    continue

                # Add important data that is related to the module environment
                record["module"] = module_dict_from_module_string(parsed_ec["full_mod_name"])
                # Retain the easyblocks used so we can use a heuristic to figure out the type of extensions
                # (R, Python, Perl)
                record["easyblocks"] = easyblocks_used

                # Use the path as the key since we know it is unique
                version_data[file] = record

    # Store the result
    with open(f"eessi_software_{eessi_version}-eb{current_eb_major}.yaml", "w") as f:
        yaml.dump(eessi_software, f)