# python-chebai/tests/unit/dataset_classes/testTox21MolNet.py at e0a794ef76fd793b60469ebd88b539ea2e0bc410 · ChEB-AI/python-chebai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import unittest
from typing import List
from unittest.mock import MagicMock, mock_open, patch

import torch

from chebai.preprocessing.datasets.tox21 import Tox21MolNet
from chebai.preprocessing.reader import ChemDataReader
from tests.unit.mock_data.tox_mock_data import Tox21MolNetMockData


class TestTox21MolNet(unittest.TestCase):
    """
    Unit tests for the `Tox21MolNet` data module.

    File reads (`builtins.open`) and writes (`torch.save`) are mocked in the
    individual tests, and a single data module instance is shared by all tests
    through `setUpClass`.
    """

    # Order in which the tests expect `setup_processed` to call `torch.save`.
    _EXPECTED_SPLIT_ORDER = ("test", "train", "validation")

    # Pairs of splits that must not share any value.
    _SPLIT_PAIRS = (
        ("train", "test"),
        ("train", "validation"),
        ("test", "validation"),
    )

    @classmethod
    @patch("os.makedirs", return_value=None)
    def setUpClass(cls, mock_makedirs: MagicMock) -> None:
        """
        Initialize a Tox21MolNet instance for testing.

        `os.makedirs` is patched while the instance is constructed
        (presumably so that no directories are created on disk).

        Args:
            mock_makedirs (MagicMock): Mocked `os.makedirs` function.
        """
        Tox21MolNet.READER = ChemDataReader
        cls.data_module = Tox21MolNet()

    def _collect_saved_splits(self, mock_torch_save: MagicMock) -> List[list]:
        """
        Check the recorded `torch.save` calls and return the saved data.

        Asserts that `torch.save` was called exactly three times and that the
        target path of each call names the expected split.

        Args:
            mock_torch_save (MagicMock): Mocked `torch.save` function.

        Returns:
            List[list]: The data passed to `torch.save`, ordered as in
                `_EXPECTED_SPLIT_ORDER` (test, train, validation).
        """
        self.assertEqual(
            mock_torch_save.call_count, 3, "Expected torch.save to be called 3 times."
        )

        saved_splits = []
        for save_call, split_name in zip(
            mock_torch_save.call_args_list, self._EXPECTED_SPLIT_ORDER
        ):
            # save_call[0] holds the positional arguments: (data, target path).
            positional_args = save_call[0]
            self.assertIn(
                split_name, positional_args[1], f"Missing '{split_name}' split."
            )
            saved_splits.append(positional_args[0])
        return saved_splits

    def _assert_splits_disjoint(
        self, saved_splits: List[list], key: str, detail: str = ""
    ) -> None:
        """
        Assert that no value of `key` occurs in more than one split.

        Args:
            saved_splits (List[list]): Data of the splits, ordered as in
                `_EXPECTED_SPLIT_ORDER`.
            key (str): Key of each record whose values are compared.
            detail (str): Text appended to the failure message before the
                final period, e.g. " (based on 'ident')".
        """
        values_by_split = {
            split_name: {record[key] for record in records}
            for split_name, records in zip(self._EXPECTED_SPLIT_ORDER, saved_splits)
        }
        for first, second in self._SPLIT_PAIRS:
            self.assertTrue(
                values_by_split[first].isdisjoint(values_by_split[second]),
                f"Overlap detected between the {first} and {second} splits{detail}.",
            )

    @patch(
        "builtins.open",
        new_callable=mock_open,
        read_data=Tox21MolNetMockData.get_raw_data(),
    )
    def test_load_data_from_file(self, mock_open_file: MagicMock) -> None:
        """
        Test the `_load_data_from_file` method for correct output.

        Args:
            mock_open_file (MagicMock): Mocked open function to simulate file reading.
        """
        actual_data = self.data_module._load_data_from_file("fake/file/path.csv")

        first_instance = next(actual_data)

        # Check for required keys
        for key in ("features", "labels", "ident"):
            self.assertIn(
                key, first_instance, f"'{key}' key is missing in the output data."
            )

        features = first_instance["features"]
        self.assertTrue(
            all(isinstance(feature, int) for feature in features),
            "Not all elements in 'features' are integers.",
        )

        # Check that 'features' can be converted to a tensor. Only the
        # conversion is guarded, so that a failing assertion below is not
        # misreported as a conversion error.
        try:
            tensor_features = torch.tensor(features)
        except (TypeError, ValueError, RuntimeError) as e:
            self.fail(f"'features' cannot be converted to a tensor: {e}")

        # `ndim` is 1 even for an empty list, so count the elements instead.
        self.assertGreater(
            tensor_features.numel(),
            0,
            "'features' should be convertible to a non-empty tensor.",
        )

    @patch(
        "builtins.open",
        new_callable=mock_open,
        read_data=Tox21MolNetMockData.get_raw_data(),
    )
    @patch("torch.save")
    def test_setup_processed_simple_split(
        self,
        mock_torch_save: MagicMock,
        mock_open_file: MagicMock,
    ) -> None:
        """
        Test the `setup_processed` method for basic data splitting and saving.

        Args:
            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
            mock_open_file (MagicMock): Mocked `open` function to simulate file reading.
        """
        self.data_module.setup_processed()

        # Verify if torch.save was called for each split (train, test, validation)
        saved_splits = self._collect_saved_splits(mock_torch_save)

        # Check for non-overlap between train, test, and validation splits
        self._assert_splits_disjoint(saved_splits, "ident")

    @patch.object(
        Tox21MolNet,
        "_load_data_from_file",
        return_value=Tox21MolNetMockData.get_processed_grouped_data(),
    )
    @patch("torch.save")
    def test_setup_processed_with_group_split(
        self, mock_torch_save: MagicMock, mock_load_file: MagicMock
    ) -> None:
        """
        Test the `setup_processed` method for group-based splitting and saving.

        Args:
            mock_torch_save (MagicMock): Mocked `torch.save` function to avoid actual file writes.
            mock_load_file (MagicMock): Mocked `_load_data_from_file` to provide custom data.
        """
        # The data module is shared by all tests of this class, so restore the
        # original split ratio afterwards to keep the tests order-independent.
        self.addCleanup(
            setattr, self.data_module, "train_split", self.data_module.train_split
        )
        self.data_module.train_split = 0.5
        self.data_module.setup_processed()

        # Verify if torch.save was called for each split
        saved_splits = self._collect_saved_splits(mock_torch_save)

        # Check for non-overlap between train, test, and validation splits (based on 'ident')
        self._assert_splits_disjoint(saved_splits, "ident", " (based on 'ident')")

        # Check for non-overlap between train, test, and validation splits (based on 'group')
        self._assert_splits_disjoint(saved_splits, "group", " (based on 'group')")


# Run the tests of this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()