Machine-Learning-for-Medical-Language · spencerthomas1722 · Nov 12, 2024 · Nov 13, 2024 · Nov 13, 2024 · Dec 4, 2024
diff --git a/src/cnlpt/cnlp_processors.py b/src/cnlpt/cnlp_processors.py
@@ -167,7 +167,7 @@ def __init__(self, data_dir: str, tasks: set[str] = None, max_train_items=-1):
             else:
                 sep = "\t"
 
-            self.dataset = load_dataset("csv", sep=sep, data_files=data_files)
+            self.dataset = load_dataset("csv", sep=sep, data_files=data_files, keep_default_na=False)
 
             ## find out what tasks are available to this dataset, and see the overlap with what the
             ## user specified at the cli, remove those tasks so we don't also get them from other datasets

diff --git a/src/cnlpt/train_system.py b/src/cnlpt/train_system.py
@@ -309,7 +309,10 @@ def main(
                 dataset.tasks_to_labels[task] = dataset.tasks_to_labels[task][1:] + [
                     dataset.tasks_to_labels[task][0]
                 ]
-            labels = dataset.processed_dataset["train"][task]
+            if tagger[task]:
+                labels = [token_label for sent in dataset.processed_dataset["train"][task] for token_label in sent.split()]
+            else:
+                labels = dataset.processed_dataset["train"][task]
             weights = []
             label_counts = Counter(labels)
             for label in dataset.tasks_to_labels[task]:
@@ -478,25 +481,13 @@ def main(
                 # in this case we're looking at a fine-tuned model (?)
                 character_level=data_args.character_level,
             )
-
             if training_args.do_train:
                 # Setting 1) only load weights from the encoder
-                raise NotImplementedError(
-                    "This functionality has not been restored yet"
-                )
                 model = CnlpModelForClassification(
-                    model_path=model_args.encoder_name,
                     config=config,
-                    cache_dir=model_args.cache_dir,
-                    tagger=tagger,
-                    relations=relations,
                     class_weights=dataset.class_weights,
                     final_task_weight=training_args.final_task_weight,
-                    use_prior_tasks=model_args.use_prior_tasks,
-                    argument_regularization=model_args.arg_reg,
                 )
-                delattr(model, "classifiers")
-                delattr(model, "feature_extractors")
                 if training_args.do_train:
                     tempmodel = tempfile.NamedTemporaryFile(dir=model_args.cache_dir)
                     torch.save(model.state_dict(), tempmodel)
@@ -511,7 +502,6 @@ def main(
                     freeze=training_args.freeze,
                     bias_fit=training_args.bias_fit,
                 )
-
         else:
             # This only works when model_args.encoder_name is one of the
             # model card from https://huggingface.co/models
@@ -675,7 +665,7 @@ def compute_metrics_fn(p: EvalPrediction):
                     model.best_eval_results = metrics
                     if trainer.is_world_process_zero():
                         if training_args.do_train:
-                            trainer.save_model()
+                            trainer.save_model()  # NOTE: a RobertaConfig is loaded here; the reason is unclear. TODO: investigate
                             tokenizer.save_pretrained(training_args.output_dir)
                             if model_name == "cnn" or model_name == "lstm":
                                 with open(
@@ -884,7 +874,7 @@ def compute_metrics_fn(p: EvalPrediction):
 
                 out_table = process_prediction(
                     task_names=dataset.tasks,
-                    error_analysis=False,
+                    error_analysis=training_args.error_analysis,
                     output_prob=training_args.output_prob,
                     character_level=data_args.character_level,
                     task_to_label_packet=task_to_label_packet,