forked from KIT-CMS/ntuple_processor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbooking.py
More file actions
429 lines (385 loc) · 17 KB
/
booking.py
File metadata and controls
429 lines (385 loc) · 17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
from .utils import Dataset
from .utils import Selection
from .utils import Ntuple
from .utils import Cut
from .utils import Weight
from .utils import Action
from .utils import Count
from .utils import Histogram
from .utils import Variation
from ROOT import gROOT
gROOT.SetBatch(True)
from ROOT import TFile
import os
import re
import json
import itertools
import logging
logger = logging.getLogger(__name__)
def dataset_from_artusoutput(
    dataset_name, file_names, folder, files_base_directory, friends_base_directories
):
    """Create a Dataset object from a list of ROOT file names (Artus layout).

    For every entry ``name`` of ``file_names`` the main ntuple is read from
    ``<files_base_directory>/<name>/<name>.root`` and one friend is read
    from ``<friend_base_directory>/<name>/<name>.root`` for each friends
    base directory. The tree used in every file is ``<folder>/ntuple``.

    Args:
        dataset_name (str): Name of the dataset
        file_names (list): List containing the names of the .root
            files (without the ``.root`` extension)
        folder (str): Name of the TDirectoryFile in each .root file
        files_base_directory (str): Path to the files base directory
        friends_base_directories (str, list): Path, or list of paths, to
            the friends base directories. ``None`` means "no friends".

    Returns:
        dataset (Dataset): Dataset object containing TTrees

    Raises:
        FileNotFoundError: If a ROOT file cannot be opened.
        NameError: If ``folder`` is not among the keys of a ROOT file.
        Exception: If a friend resolves to a different tree name than
            the file it is attached to.
    """

    def get_full_tree_name(folder, path_to_root_file, tree_name):
        """Check that the file opens and contains `folder`; return 'folder/tree_name'."""
        root_file = TFile(path_to_root_file)
        if root_file.IsZombie():
            logger.fatal("File {} does not exist, abort".format(path_to_root_file))
            raise FileNotFoundError(
                "File {} does not exist".format(path_to_root_file)
            )
        try:
            if folder not in root_file.GetListOfKeys():
                logger.fatal(
                    "Folder {} does not exist in {}\n".format(folder, path_to_root_file)
                )
                raise NameError(
                    "Folder {} does not exist in {}".format(folder, path_to_root_file)
                )
        finally:
            # Close the file on the error path as well, so that a missing
            # folder does not leave the handle open.
            root_file.Close()
        return "/".join([folder, tree_name])

    def add_tagged_friends(friends):
        """Tag friends with the name of the different directories
        in the artus name scheme, e.g.:
            /common_path/MELA/ntuple -> tag: MELA
            /common_path/SVFit/ntuple -> tag: SVFit
        Two friend paths are compared component by component; every
        component present in only one of the two paths is used as the tag
        of the friend it belongs to, unless that friend already has a tag.
        """
        for f1, f2 in itertools.combinations(friends, 2):
            l1 = f1.path.split("/")
            l2 = f2.path.split("/")
            for t in set(l1).symmetric_difference(l2):
                if t in l1 and f1.tag is None:
                    f1.tag = t
                elif t in l2 and f2.tag is None:
                    f2.tag = t
        return friends

    # The docstring allows a single path; iterating a bare string would
    # walk over its characters instead.
    if friends_base_directories is None:
        friends_base_directories = []
    elif isinstance(friends_base_directories, str):
        friends_base_directories = [friends_base_directories]

    ntuples = []
    for file_name in file_names:
        root_file_name = "{}.root".format(file_name)
        # E.g.: file_base_dir/file_name/file_name.root
        root_file = os.path.join(files_base_directory, file_name, root_file_name)
        tdf_tree = get_full_tree_name(folder, root_file, "ntuple")
        friends = []
        for friends_base_directory in friends_base_directories:
            friend_path = os.path.join(
                friends_base_directory, file_name, root_file_name
            )
            tdf_tree_friend = get_full_tree_name(folder, friend_path, "ntuple")
            if tdf_tree != tdf_tree_friend:
                logger.fatal(
                    "Extracted wrong TDirectoryFile from friend which is not the same than the base file."
                )
                raise Exception(
                    "Tree {} of friend {} differs from tree {} of {}".format(
                        tdf_tree_friend, friend_path, tdf_tree, root_file
                    )
                )
            friends.append(Ntuple(friend_path, tdf_tree_friend))
        ntuples.append(Ntuple(root_file, tdf_tree, add_tagged_friends(friends)))
    return Dataset(dataset_name, ntuples)
def dataset_from_crownoutput(
    dataset_name,
    file_names,
    era,
    channel,
    folder,
    files_base_directory,
    friends_base_directories=None,
    validate_samples=False
):
    """Create a Dataset object from a list of sample names (CROWN layout).

    For every entry ``name`` of ``file_names`` all files found in
    ``<files_base_directory>/<era>/<name>/<channel>/`` are used as main
    ntuples. For each of them, one friend per friends base directory is
    looked up at
    ``<friend_base_directory>/<era>/<name>/<channel>/<same file name>``.
    The tree used in every file is ``ntuple``. Files (and friends) that do
    not list that tree among their keys are skipped.

    Args:
        dataset_name (str): Name of the dataset
        file_names (list): List containing the names of the samples
        era (str): Name of the era, used as a directory level
        channel (str): Name of the considered channel, needed for directories
        folder (str): Not used by this function
        files_base_directory (str): Path to the files base directory
        friends_base_directories (str, list): Path, or list of paths, to
            the friends base directories. ``None`` (default) means
            "no friends".
        validate_samples (bool): If True, compare the leaf names of all
            non-empty files (and of their friends) against those of the
            first one, and log the differences that are found.

    Returns:
        dataset (Dataset): Dataset object containing TTrees

    Raises:
        FileNotFoundError: If a ROOT file cannot be opened.
        Exception: If a friend resolves to a different tree name than
            the file it is attached to.
    """

    def get_quantities_per_variation(path_to_root_file):
        """Map each variation name to the quantities carrying it.

        Leaves named ``<quantity>__<variation>`` are grouped by variation.
        """
        root_file = TFile(path_to_root_file)
        if root_file.IsZombie():
            logger.fatal("File {} does not exist, abort".format(path_to_root_file))
            raise FileNotFoundError(
                "File {} does not exist".format(path_to_root_file)
            )
        quantities_per_vars = {}
        try:
            for leaf in root_file.Get("ntuple").GetListOfLeaves():
                leaf_name = leaf.GetName()
                if "__" in leaf_name:
                    quantity, var = leaf_name.split("__")
                    quantities_per_vars.setdefault(var, []).append(quantity)
                if "-" in leaf_name:
                    logger.warning("Found a '-' in quantity name {} - This can result in unwanted behaviour for systematic shifts".format(leaf_name))
        finally:
            root_file.Close()
        return quantities_per_vars

    def get_full_tree_name(path_to_root_file, tree_name):
        """Check that the file can be opened and return `tree_name` unchanged."""
        root_file = TFile(path_to_root_file)
        if root_file.IsZombie():
            logger.fatal("File {} does not exist, abort".format(path_to_root_file))
            raise FileNotFoundError(
                "File {} does not exist".format(path_to_root_file)
            )
        root_file.Close()
        return tree_name

    def is_empty_file(path_to_root_file, tree_name):
        """Return True if no key of the file has `tree_name` as its title."""
        root_file = TFile(path_to_root_file)
        try:
            key_titles = [key.GetTitle() for key in root_file.GetListOfKeys()]
        finally:
            # Close on both outcomes; the handle must not stay open when
            # the file turns out to be empty.
            root_file.Close()
        return tree_name not in key_titles

    def add_tagged_friends(friends):
        """Tag friends with the name of the different directories
        in the artus name scheme, e.g.:
            /common_path/MELA/ntuple -> tag: MELA
            /common_path/SVFit/ntuple -> tag: SVFit
        Two friend paths are compared component by component; every
        component present in only one of the two paths is used as the tag
        of the friend it belongs to, unless that friend already has a tag.
        """
        for f1, f2 in itertools.combinations(friends, 2):
            l1 = f1.path.split("/")
            l2 = f2.path.split("/")
            for t in set(l1).symmetric_difference(l2):
                if t in l1 and f1.tag is None:
                    f1.tag = t
                elif t in l2 and f2.tag is None:
                    f2.tag = t
        return friends

    def read_leaf_names(path_to_root_file):
        """Return the set of leaf names of the 'ntuple' tree, closing the file afterwards."""
        root_file = TFile(path_to_root_file)
        try:
            return {
                leaf.GetName()
                for leaf in root_file.Get("ntuple").GetListOfLeaves()
            }
        finally:
            root_file.Close()

    def check_validity(root_file_path, validation_dict, friends):
        """Compare the leaves of a file and of its friends with the reference sets.

        The first file that is checked defines the reference sets stored in
        `validation_dict`; any later mismatch is appended to
        ``validation_dict["errors"]``.
        """
        quantities = read_leaf_names(root_file_path)
        friend_quantities = set()
        for friend_path in friends:
            friend_quantities.update(read_leaf_names(friend_path))
        # first we check the main ntuple, then the friends
        errordata = {}
        if not validation_dict["varset"]:
            validation_dict["varset"] = quantities
        else:
            difference = validation_dict["varset"].symmetric_difference(quantities)
            if difference:
                errordata["file"] = root_file_path
                errordata["difference"] = difference
        if not validation_dict["friends_varset"]:
            validation_dict["friends_varset"] = friend_quantities
        else:
            difference = validation_dict["friends_varset"].symmetric_difference(
                friend_quantities
            )
            if difference:
                errordata["friends"] = friends
                errordata["friends_difference"] = difference
        if errordata:
            validation_dict["errors"].append(errordata)

    # The default is None and the docstring allows a single path; neither
    # can be iterated as a list of directories.
    if friends_base_directories is None:
        friends_base_directories = []
    elif isinstance(friends_base_directories, str):
        friends_base_directories = [friends_base_directories]

    # files_base_directory: ntuple/era
    # friends_base_directory: friends/friend_type/era
    root_files = []
    for f in file_names:
        sample_directory = os.path.join(files_base_directory, era, f, channel)
        for g in os.listdir(sample_directory):
            root_files.append((os.path.join(sample_directory, g), f))

    ntuples = []
    validation_dict = {
        "varset": set(),
        "friends_varset": set(),
        "errors": []
    }
    if validate_samples:
        logger.info("Running ntuple validation for {} - {} - {}".format(era, channel, dataset_name))

    for root_file, file_name in root_files:
        tdf_tree = get_full_tree_name(root_file, "ntuple")
        friend_base_name = os.path.basename(root_file)
        friends = []
        friend_paths = []
        for friends_base_directory in friends_base_directories:
            friend_path = os.path.join(
                friends_base_directory, era, file_name, channel, friend_base_name
            )
            friend_paths.append(friend_path)
            tdf_tree_friend = get_full_tree_name(friend_path, "ntuple")
            if tdf_tree != tdf_tree_friend:
                logger.fatal(
                    "Extracted wrong TDirectoryFile from friend which is not the same than the base file."
                )
                raise Exception(
                    "Tree {} of friend {} differs from tree {} of {}".format(
                        tdf_tree_friend, friend_path, tdf_tree, root_file
                    )
                )
            if not is_empty_file(friend_path, tdf_tree):
                friends.append(Ntuple(friend_path, tdf_tree_friend))
        if not is_empty_file(root_file, tdf_tree):
            ntuples.append(Ntuple(root_file, tdf_tree, add_tagged_friends(friends)))
            # Only files that contain the tree can be validated.
            if validate_samples:
                check_validity(root_file, validation_dict, friend_paths)

    # Report a validation result only when validation was requested.
    if validate_samples:
        if validation_dict["errors"]:
            logger.fatal("Validation for {} - {} - {} failed, differences were found".format(era, channel, dataset_name))
            for error in validation_dict["errors"]:
                if error.get("difference"):
                    logger.fatal("File {} has the following differences:".format(error["file"]))
                    logger.fatal("\t{}".format(error["difference"]))
                if error.get("friends_difference"):
                    logger.fatal("Friends {} have the following differences:".format(error["friends"]))
                    logger.fatal("\t{}".format(error["friends_difference"]))
        else:
            logger.info("Validation for {} - {} - {} passed".format(era, channel, dataset_name))

    # The variations are read from the first listed file only.
    quantities_per_vars = get_quantities_per_variation(root_files[0][0])
    return Dataset(dataset_name, ntuples, quantities_per_vars=quantities_per_vars)
class Unit:
    """
    Building element of a minimal analysis flow, consisting
    of a dataset, a set of selections to apply on the data
    and a set of actions.

    Args:
        dataset (Dataset): Set of TTree objects to run the
            analysis on
        selections (list): List of Selection-type objects
        actions (list): List of Action-type objects to perform on
            the processed dataset, each a 'Histogram' or a 'Count'
        variation (Variation): Variations applied, meaning
            that this selection is the result of a variation
            applied on other selections

    Attributes:
        dataset (Dataset): Set of TTree objects to run the
            analysis on
        selections (list): List of Selection-type objects
        actions (list): Renamed copies of the actions passed to the
            constructor
        variation (Variation): The variation passed to the
            constructor. Only set when a variation was given and
            at least one action was passed.

    Raises:
        TypeError: If an argument does not have the expected type, or
            if an action is neither a Histogram nor a Count.
    """

    def __init__(self, dataset, selections, actions, variation=None):
        # The order matters: action names are built from the dataset
        # and the selections.
        self.__set_dataset(dataset)
        self.__set_selections(selections)
        self.__set_actions(actions, variation)

    def __str__(self):
        layout = "\n".join(
            [
                "Dataset: {}".format(self.dataset.name),
                "Selections: {}".format(self.selections),
                "Actions: {}".format(self.actions),
            ]
        )
        return layout

    def __set_dataset(self, dataset):
        if not isinstance(dataset, Dataset):
            raise TypeError("not a Dataset object.")
        self.dataset = dataset

    def __set_selections(self, selections):
        if not isinstance(selections, list):
            raise TypeError("not a list object.")
        for selection in selections:
            if not isinstance(selection, Selection):
                raise TypeError("not a Selection object.")
        self.selections = selections

    def __set_actions(self, actions, variation):
        if not isinstance(actions, list):
            raise TypeError("not a list object.")
        for action in actions:
            if not isinstance(action, Action):
                raise TypeError("not an Action object.")
        self.actions = [self.__set_new_action(action, variation) for action in actions]

    def __set_new_action(self, action, variation):
        """Return a copy of `action` carrying the name used for this unit.

        Without a variation the name is
        ``<dataset>#<selection1-selection2-...>#Nominal#<action name>``;
        with a variation, "Nominal" in the action's current name is
        replaced by the variation name.
        """
        if variation is None:
            name = "#".join(
                [
                    self.dataset.name,
                    "-".join([selection.name for selection in self.selections]),
                    "Nominal",
                    action.name,
                ]
            )
        else:
            if not isinstance(variation, Variation):
                raise TypeError("not a Variation object.")
            self.variation = variation
            name = action.name.replace("Nominal", self.variation.name)
        if isinstance(action, Histogram):
            return Histogram(name, action.variable, action.edges)
        if isinstance(action, Count):
            return Count(name, action.variable)
        # Fail here instead of silently storing None in self.actions.
        raise TypeError(
            "not a Histogram or Count object: {}".format(type(action).__name__)
        )

    def __eq__(self, other):
        # Let Python fall back to its default comparison for foreign
        # types instead of failing on a missing attribute.
        if not isinstance(other, Unit):
            return NotImplemented
        return (
            self.dataset == other.dataset
            and self.selections == other.selections
            and self.actions == other.actions
        )

    def __hash__(self):
        return hash((self.dataset, tuple(self.selections), tuple(self.actions)))
class UnitManager:
    """
    Manager of all the Unit objects that are created.
    It can both be initialized with a variable amount of Unit
    objects as arguments or with no arguments, with the above mentioned
    objects added in a second time with the function 'book'.

    Args:
        *units: Unit objects to book right away.

    Attributes:
        booked_units (list): List of the booked units, updated during
            initialization or with the function 'book'
    """

    def __init__(self, *units):
        # One list per manager: a list defined on the class would be
        # shared by every instance.
        self.booked_units = []
        if units:
            self.book(list(units))

    def book(self, units, variations=None, enable_check=True):
        """Book units and, optionally, their varied copies.

        Args:
            units (list): Units to book; units equal to an already
                booked one are not added again.
            variations (list): Variations to apply to every unit of
                `units`; each resulting unit is booked as well.
            enable_check (bool): If True, verify that no two actions of
                the booked units share a name.

        Raises:
            NameError: If the check finds two actions with the same name.
        """
        for unit in units:
            if unit not in self.booked_units:
                self.booked_units.append(unit)
        if variations:
            for variation in variations:
                logger.debug("Applying variation {}".format(variation))
                for unit in units:
                    self.apply_variation(unit, variation)
        if enable_check:
            self._check_unique_action_names()

    def apply_variation(self, unit, variation):
        """Book the unit created by applying `variation` to `unit`."""
        new_unit = variation.create(unit)
        self.booked_units.append(new_unit)

    def _check_unique_action_names(self):
        """Raise NameError if two actions of the booked units have the same name."""
        # A single pass with a set replaces the comparison of all pairs.
        seen_names = set()
        for unit in self.booked_units:
            for action in unit.actions:
                if action.name in seen_names:
                    logger.fatal(
                        "Caught two actions with same name ({}, {})".format(
                            action.name, action.name
                        )
                    )
                    raise NameError(
                        "Caught two actions with same name ({})".format(action.name)
                    )
                seen_names.add(action.name)