Remove need to specify scanner strength

mathrip · mathrip · commit 89a3c22d4660 · 2025-04-09T11:03:30.000+01:00
diff --git a/docs/FAQs.md b/docs/FAQs.md
@@ -50,6 +50,17 @@ Solution:
 
 ## **Issues & questions with pipeline use**
 
+### **I have an issue with FLAIR feature that does not exist**
+
+If you are running a subject with only a T1 scan and no FLAIR scan but you receive an issue like :
+```bash
+KeyError: "Unable to open object (object '.on_lh.gm_FLAIR_0.25.sm3.mgh' doesn't exist)"
+exit status 1
+```
+You are likely having this issue because you might have previously run this same subject ID with a FLAIR scan, and the FreeSurfer segmentation was done using the FLAIR scan. Therefore, even if you remove the FLAIR scan from the input data and run the command again, the intermediate FreeSurfer outputs for that subject still contain FLAIR information, which makes the pipeline look for the FLAIR feature but fail to find it.
+
+To avoid this in the future, if you want to run the same subject with and without FLAIR, you should create two separate input folders with two different subject IDs, such as `sub-0001noflair` and `sub-0001flair`.
+
 ### **I have an issue during the harmonisation**
 
 If your issue looks like :
diff --git a/docs/images/example_demographic_csv.png b/docs/images/example_demographic_csv.png
diff --git a/docs/prepare_data.md b/docs/prepare_data.md
@@ -63,7 +63,6 @@ You can copy the *demographics_file.csv* that you can find in your <meld_data_fo
 - Group: 'patient' if the subject is a patient or 'control' if the subject is a control 
 - Age at preoperative: The age of the subject at the time of the preoperative T1 scan (in years)
 - Sex: 'male' if the subject is a male or 'female' if the subject is a female
-- Scanner: the scanner strenght associated with the MRI data ('3T' for 3 Tesla or '15T' for 1.5 Tesla)
 
 ### Warning 
 - please ensure the column names are unchanged and completed with the appropriate values, otherwise the pipeline will fail.
diff --git a/meld_graph/data_preprocessing.py b/meld_graph/data_preprocessing.py
@@ -563,7 +563,6 @@ def make_boundary_zones(self, smoothing=0, boundary_feature_name=".on_lh.boundar
                     
     def load_covars(self, subject_ids=None, demographic_file=DEMOGRAPHIC_FEATURES_FILE):
         # if not os.path.isfile(demographic_file):
-        # demographic_file = os.path.join(self.data_dir,demographic_file)
         if subject_ids is None:
             subject_ids = self.subject_ids
         covars = pd.DataFrame()
@@ -584,7 +583,7 @@ def load_covars(self, subject_ids=None, demographic_file=DEMOGRAPHIC_FEATURES_FI
             else:
                 print(f'ERROR: There is an issue with the coded sex of subject {subject}')
             group.append(subj.is_patient)
-            sites_scanners.append(subj.site_code + "_" + subj.scanner)
+            sites_scanners.append(subj.site_code) # just site code now
             
         covars["ages"] = ages
         covars["sex"] = sex
@@ -652,7 +651,7 @@ def shrink_combat_estimates(self, estimates):
     def unshrink_combat_estimates(self, estimates):
         """ unshrink combat estimates to use as input in neuroCombatFromTraining"""
         num_subjects = estimates['num_subjects'][0]
-        mod_mean = np.zeros((len(estimates['stand.mean']),num_subjects ))
+        mod_mean = np.zeros((len(estimates['stand.mean']),num_subjects))
         estimates['mod.mean'] = mod_mean
         estimates['stand.mean'] = np.tile(estimates['stand.mean'], (num_subjects,1)).T
         return estimates
@@ -765,7 +764,7 @@ def get_combat_new_site_parameters(
             return
         # load in covariates - age, sex, group, site and scanner unless provided    
         new_site_covars = self.load_covars(subject_ids=np.array(listids)[np.array(combat_subject_include)], demographic_file=demographic_file).copy()
-        #check site_scanner codes are the same for all subjects
+        # check site_scanner codes are the same for all subjects
         if len(new_site_covars['site_scanner'].unique())==1:
             site_scanner = new_site_covars['site_scanner'].unique()[0]
         else:
@@ -854,12 +853,13 @@ def combat_new_subject(self, feature_name, combat_params_file):
                 rh = subj.load_feature_values(feature_name, hemi="rh")[self.cohort.cortex_mask]
                 combined_hemis = np.hstack([lh, rh])
                 precombat_features.append(combined_hemis)
-                site_scanner.append(subj.site_code + "_" + subj.scanner)
+                site_scanner.append(subj.site_code) # just site code now
                 subjects_included.append(subject)
         #if matrix empty, pass
         if precombat_features:
             combat_estimates = self.read_norm_combat_parameters(feature_name, combat_params_file)
             combat_estimates = self.unshrink_combat_estimates(combat_estimates)
+            combat_estimates["batches"] = [x.split('_')[0] for x in combat_estimates["batches"]] # remove scanner strength from the batch code if it exists
             precombat_features = np.array(precombat_features)
             site_scanner = np.array(site_scanner)
             dict_combat = neuroCombatFromTraining(dat=precombat_features.T, batch=site_scanner, estimates=combat_estimates)
@@ -1224,7 +1224,7 @@ def intra_inter_subject(self, feature, cohort_for_norm=None, params_norm=None):
             else:
                 included_subjects[k] = False
                 controls_subjects[k] = False
-        print(f"INFO: exlude subjects {np.array(self.subject_ids)[~included_subjects]}")
+        print(f"INFO: exclude subjects {np.array(self.subject_ids)[~included_subjects]}")
         if vals_array:
             vals_array = np.array(vals_array)
             # remove exclude subjects
@@ -1287,7 +1287,7 @@ def asymmetry_subject(self, feature, cohort_for_norm=None, params_norm=None):
             else:
                 included_subjects[k] = False
                 controls_subjects[k] = False
-            print(f"INFO: exlude subjects {np.array(self.subject_ids)[~included_subjects]}")
+        print(f"INFO: exclude subjects {np.array(self.subject_ids)[~included_subjects]}")
         if vals_asym_array :
             vals_asym_array = np.array(vals_asym_array)
             # remove exclude subjects
diff --git a/meld_graph/download_data.py b/meld_graph/download_data.py
@@ -20,7 +20,7 @@ def download_test_data():
     """
     Download test data from figshare
     """
-    url = "https://figshare.com/ndownloader/files/50432751?private_link=3b790cfb027f4036f19a"
+    url = "https://figshare.com/ndownloader/files/53523443?private_link=413bc45083e67c7e7a11"
     test_data_dir = MELD_DATA_PATH
     os.makedirs(test_data_dir, exist_ok=True)
     print('downloading test data to '+ test_data_dir)
diff --git a/meld_graph/meld_cohort.py b/meld_graph/meld_cohort.py
@@ -240,7 +240,7 @@ def get_subject_ids(self, **kwargs):
         if isinstance(site_codes, str):
             site_codes = [site_codes]
         # get scanners
-        scanners = kwargs.get("scanners", ["3T", "15T"])
+        scanners = kwargs.get("scanners", ["3T", "15T", "XT"])
         if not isinstance(scanners, list):
             scanners = [scanners]
 
@@ -387,16 +387,16 @@ def __init__(self, subject_id, cohort):
 
     @property
     def scanner(self):
+        # Note: no need to specify scanner strength with MELD Graph pipeline, but still need it to be compatible with previous MELD FCD dataset
         scanner = self.get_demographic_features('Scanner')
+        if scanner is None:
+            scanner="XT" #no need to specify 
         if scanner in ("15T" , "1.5T" , "15t" , "1.5t" ):
-            scanner="15T"
+            scanner="15T" # to be compatible with old way
         elif scanner in ("3T" , "3t" ):
-            scanner="3T"
+            scanner="3T" # to be compatible with old way
         else:
-            print(
-                f"Error: incorrect scanner for {self.subject_id}. Unable to determine if scanner 15T or 3T "
-            )
-            sys.exit()
+            scanner="XT" #no need to specify 
         return scanner
 
     @property
@@ -418,6 +418,12 @@ def surf_dir_path(self, hemi):
         """return path to features dir (surf_dir)"""
         return os.path.join(self.site_code, self.scanner, self.group, self.subject_id, hemi)
 
+    def find_path(self, name):
+        """ Find the first object with the subject id in the hdf5"""
+        if self.subject_id in name:
+            return name    
+    
+    
     @property
     def is_patient(self):
         return self.group == "patient"
@@ -437,10 +443,10 @@ def get_lesion_hemisphere(self):
             return None
 
         with self.cohort._site_hdf5(self.site_code, self.group) as f:
-            surf_dir_lh = f.require_group(self.surf_dir_path("lh"))
+            surf_dir_lh = f[os.path.join(self.site_code, f[self.site_code].visit(self.find_path), "lh")]
             if ".on_lh.lesion.mgh" in surf_dir_lh.keys():
                 return "lh"
-            surf_dir_rh = f.require_group(self.surf_dir_path("rh"))
+            surf_dir_rh = f[os.path.join(self.site_code, f[self.site_code].visit(self.find_path), "rh")]
             if ".on_lh.lesion.mgh" in surf_dir_rh.keys():
                 return "rh"
         return None
@@ -452,7 +458,8 @@ def has_features(self, features):
     def get_feature_list(self, hemi="lh"):
         """Outputs a list of the features a participant has for each hemisphere"""
         with self.cohort._site_hdf5(self.site_code, self.group) as f:
-            keys = list(f[self.surf_dir_path(hemi)].keys())
+            surf_dir_path = os.path.join(self.site_code, f[self.site_code].visit(self.find_path), hemi)
+            keys =  list(f[surf_dir_path].keys())
             # remove lesion and boundaries from list of features
             if ".on_lh.lesion.mgh" in keys:
                 keys.remove(".on_lh.lesion.mgh")
@@ -514,6 +521,8 @@ def get_demographic_features(
 
                 if "urfer" in desired_name:
                     matched_name = "Freesurfer_nul"
+                elif "Scanner" in desired_name:
+                    return None
                 else:
                     self.log.warning(f"Unable to find column matching {desired_name}, please double check for typos")
                     return None
@@ -551,7 +560,7 @@ def load_feature_values(self, feature, hemi="lh"):
         feature_values = np.zeros(NVERT, dtype=np.float32)
         # read data from hdf5
         with self.cohort._site_hdf5(self.site_code, self.group) as f:
-            surf_dir = f[self.surf_dir_path(hemi)]
+            surf_dir = f[os.path.join(self.site_code, f[self.site_code].visit(self.find_path), hemi)]
             if feature in surf_dir.keys():
                 feature_values[:] = surf_dir[feature][:]
             else:
diff --git a/meld_graph/tools_pipeline.py b/meld_graph/tools_pipeline.py
@@ -107,7 +107,7 @@ def create_demographic_file(subjects_ids, save_file, harmo_code='noHarmo'):
     df['ID']=subjects_ids.astype(str)
     df['Harmo code']=[str(harmo_code) for subject in subjects_ids]
     df['Group']=['patient' for subject in subjects_ids]
-    df['Scanner']=['3T' for subject in subjects_ids]
+    df['Scanner']=['XT' for subject in subjects_ids]
     df.to_csv(save_file)
     
 def create_dataset_file(subjects_ids, save_file):
diff --git a/scripts/data_preparation/extract_features/io_meld.py b/scripts/data_preparation/extract_features/io_meld.py
@@ -78,9 +78,9 @@ def load_subject_features(fs_id,features,subject_number,medial_wall,subjects_dir
 
 def get_group_site(fs_id, csv_path):
         """
-        Read demographic features from csv file and extract group, sex and scanner 
+        Read demographic features from csv file and extract harmo code and group  
         """
-        features_name=["Harmo code", "Group", "Scanner"]
+        features_name=["Harmo code", "Group"]
         df = pd.read_csv(csv_path, header=0, encoding="latin")
         # get index column
         id_col = None
@@ -124,14 +124,9 @@ def save_subject(fs_id,features,medial_wall,subject_dir, demographic_file,  outp
     failed=False
     n_vert=163842
     #get subject info from id
-    site_code, c_p, scanner = get_group_site(fs_id, demographic_file)
-    if scanner in ("15T" , "1.5T" , "15t" , "1.5t" ):
-        scanner="15T"
-    elif scanner in ("3T" , "3t" ):
-        scanner="3T"
-    else:
-        print('scanner for subject '+ fs_id + ' cannot be identified as either 1.5T or 3T...')
-        scanner='false'
+    site_code, c_p = get_group_site(fs_id, demographic_file)
+    print('scanner for subject '+ fs_id + 'is set as default XT')
+    scanner='XT'
     #skip subject if info not available
     if 'false' in (c_p, scanner, site_code):
         print("Skipping subject " + fs_id)