flowmldetection.py
from slips_files.common.abstracts import Module
import multiprocessing
from slips_files.core.database.database import __database__
from slips_files.common.config_parser import ConfigParser
from slips_files.common.slips_utils import utils
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
import json
import datetime
import traceback
# Only for debugging
# from matplotlib import pyplot as plt
# This horrible hack is only to stop sklearn from printing those warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
class Module(Module, multiprocessing.Process):
    # Name: short name of the module. Do not use spaces
    name = 'Flow ML Detection'
    description = (
        'Train or test a Machine Learning model to detect malicious flows'
    )
    authors = ['Sebastian Garcia']

    def __init__(self, outputqueue, redis_port):
        multiprocessing.Process.__init__(self)
        self.outputqueue = outputqueue
        __database__.start(redis_port)
        # Subscribe to the channel
        self.c1 = __database__.subscribe('new_flow')
        self.fieldseparator = __database__.getFieldSeparator()
        # Set the output queue of our database instance
        __database__.setOutputQueue(self.outputqueue)
        # Read the configuration
        self.read_configuration()
        # Minimum number of new labels needed to trigger a retraining
        self.minimum_lables_to_retrain = 50
        # Assumed initialization (this flag was read but never set in the
        # original file): marks whether the model changed since it was last
        # stored, so store_model() knows whether to write it to disk
        self.model_updated = False
        # To plot the scores of training
        # self.scores = []
        # The scaler trained during training and to use during testing
        self.scaler = StandardScaler()
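        # Note: this same scaler instance is fit on each training batch and
        # reused (with transform only) when testing; store_model() and
        # read_model() persist it on disk together with the classifier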
    def read_configuration(self):
        conf = ConfigParser()
        self.mode = conf.get_ml_mode()
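        # The mode read from the config is expected to be 'train' or 'test'
        # (those are the only values checked in run() below)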
    def print(self, text, verbose=1, debug=0):
        """
        Function to use to print text using the outputqueue of slips.
        Slips then decides how, when and where to print this text
        by taking all the processes into account
        :param verbose:
            0 - don't print
            1 - basic operation/proof of work
            2 - log I/O operations and filenames
            3 - log database/profile/timewindow changes
        :param debug:
            0 - don't print
            1 - print exceptions
            2 - unsupported and unhandled types (cases that may cause errors)
            3 - red warnings that need examination - developer warnings
        :param text: text to print. Can include format like 'Test {}'.format('here')
        """
        levels = f'{verbose}{debug}'
        self.outputqueue.put(f'{levels}|{self.name}|{text}')
    def train(self):
        """
        Train a model based on the flows we receive and the labels
        """
        try:
            # Process the labels to have only Normal and Malware
            self.flows.label = self.flows.label.str.replace(
                r'(^.*ormal.*$)', 'Normal'
            )
            self.flows.label = self.flows.label.str.replace(
                r'(^.*alware.*$)', 'Malware'
            )
            self.flows.label = self.flows.label.str.replace(
                r'(^.*alicious.*$)', 'Malware'
            )
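            # Note: these patterns are regular expressions; on recent pandas
            # versions str.replace() may need regex=True to keep this behaviour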
            # Separate
            y_flow = self.flows['label']
            X_flow = self.flows.drop('label', axis=1)
            X_flow = X_flow.drop('module_labels', axis=1)
            # Normalize this batch of data so far. This can get progressively slower
            X_flow = self.scaler.fit_transform(X_flow)
            # Train
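            # partial_fit updates the existing classifier with only this batch,
            # so the model keeps learning incrementally across retrainings
            # (the classes argument is required on the first call)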
            try:
                self.clf.partial_fit(
                    X_flow, y_flow, classes=['Malware', 'Normal']
                )
            except Exception as ex:
                self.print('Error while calling clf.partial_fit()')
                self.print(traceback.format_exc())
            # See score so far in training
            score = self.clf.score(X_flow, y_flow)
            # To debug the training score
            # self.scores.append(score)
            self.print(f' Training Score: {score}', 0, 1)
            # self.print(f' Model Parameters: {self.clf.coef_}')
            # Debug code to store a plot in a png of the scores
            # plt.plot(self.scores)
            # plt.savefig('train-scores.png')
            # Mark the model as updated so that store_model() persists it
            # (assumed flag handling, see the note in __init__)
            self.model_updated = True
            # Store the models on disk
            self.store_model()
        except Exception as inst:
            self.print('Error in train()', 0, 1)
            self.print(traceback.format_exc(), 0, 1)
    def process_features(self, dataset):
        """
        Discards some features of the dataset and can create new ones.
        Clean the dataset
        """
        try:
            # Discard some types of flows that don't have ports
            to_discard = ['arp', 'ARP', 'icmp', 'igmp', 'ipv6-icmp']
            for proto in to_discard:
                dataset = dataset[dataset.proto != proto]
            # For now, discard these columns
            to_drop = [
                'appproto',
                'daddr',
                'saddr',
                'ts',
                'origstate',
                'flow_type',
                'smac',
                'dmac',
            ]
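            # Depending on the pandas version, dropping a missing column
            # raises KeyError or ValueError, so both are handled below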
            for field in to_drop:
                try:
                    dataset = dataset.drop(field, axis=1)
                except (KeyError, ValueError):
                    pass
            # Convert state to categorical
            dataset.state = dataset.state.str.replace(
                r'(^.*NotEstablished.*$)', '0'
            )
            dataset.state = dataset.state.str.replace(
                r'(^.*Established.*$)', '1'
            )
            dataset.state = dataset.state.astype('float64')
            # Convert proto to categorical. For now we only have a few
            # protocols, so we can hardcode the mapping...
            # We don't use the data to create categories because in testing mode
            # we don't see all the protocols
            # Also we don't store the Categorizer because the user can retrain
            # with their own data.
            dataset.proto = dataset.proto.str.lower()
            dataset.proto = dataset.proto.str.replace(r'(^.*tcp.*$)', '0')
            dataset.proto = dataset.proto.str.replace(r'(^.*udp.*$)', '1')
            dataset.proto = dataset.proto.str.replace(r'(^.*icmp.*$)', '2')
            dataset.proto = dataset.proto.str.replace(
                r'(^.*icmp-ipv6.*$)', '3'
            )
            dataset.proto = dataset.proto.str.replace(r'(^.*arp.*$)', '4')
            dataset.proto = dataset.proto.astype('float64')
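            # Note: icmp/arp flows are discarded at the top of this function,
            # so in practice the codes 2-4 rarely appear in the final dataset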
            try:
                # Convert dport to float
                dataset.dport = dataset.dport.astype('float')
            except ValueError:
                pass
            try:
                # Convert sport to float
                dataset.sport = dataset.sport.astype('float')
            except ValueError:
                pass
            try:
                # Convert dur to float
                dataset.dur = dataset.dur.astype('float')
            except ValueError:
                pass
            try:
                # Convert pkts (total packets) to float
                dataset.pkts = dataset.pkts.astype('float')
            except ValueError:
                pass
            try:
                # Convert spkts (source packets) to float
                dataset.spkts = dataset.spkts.astype('float')
            except ValueError:
                pass
            try:
                # Convert allbytes (total bytes) to float
                dataset.allbytes = dataset.allbytes.astype('float')
            except ValueError:
                pass
            try:
                # Convert sbytes (source bytes) to float
                dataset.sbytes = dataset.sbytes.astype('float')
            except ValueError:
                pass
            return dataset
        except Exception as ex:
            self.print('Error in process_features()')
            self.print(traceback.format_exc(), 0, 1)
    def process_flows(self):
        """
        Process all the flows in the DB
        Store the pandas df in self.flows
        """
        try:
            # We get all the flows so far
            # because this retraining happens in batches
            flows = __database__.get_all_flows()
            # Check how many different labels are in the DB
            # We need both normal and malware
            labels = __database__.get_labels()
            if len(labels) == 1:
                # Only 1 label has flows
                # There are not enough different labels, so insert two flows
                # that are fake but representative of a normal and malware flow
                # they are only for the training process
                # At least 1 flow of each label is required
                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
                flows.append(
                    {
                        'ts': 1594417039.029793,
                        'dur': '1.9424750804901123',
                        'saddr': '10.7.10.101',
                        'sport': '49733',
                        'daddr': '40.70.224.145',
                        'dport': '443',
                        'proto': 'tcp',
                        'origstate': 'SRPA_SPA',
                        'state': 'Established',
                        'pkts': 84,
                        'allbytes': 42764,
                        'spkts': 37,
                        'sbytes': 25517,
                        'appproto': 'ssl',
                        'label': 'Malware',
                        'module_labels': {
                            'flowalerts-long-connection': 'Malware'
                        },
                    }
                )
                flows.append(
                    {
                        'ts': 1382355032.706468,
                        'dur': '10.896695',
                        'saddr': '147.32.83.52',
                        'sport': '47956',
                        'daddr': '80.242.138.72',
                        'dport': '80',
                        'proto': 'tcp',
                        'origstate': 'SRPA_SPA',
                        'state': 'Established',
                        'pkts': 67,
                        'allbytes': 67696,
                        'spkts': 1,
                        'sbytes': 100,
                        'appproto': 'http',
                        'label': 'Normal',
                        'module_labels': {
                            'flowalerts-long-connection': 'Normal'
                        },
                    }
                )
            # If there are enough flows, we don't insert them anymore
            # Convert to pandas df
            df_flows = pd.DataFrame(flows)
            # Process features
            df_flows = self.process_features(df_flows)
            # Update the flow to the processed version
            self.flows = df_flows
        except Exception as ex:
            self.print('Error in process_flows()')
            self.print(traceback.format_exc(), 0, 1)
    def process_flow(self):
        """
        Process one flow. Only used during detection in testing
        Store the pandas df in self.flow
        """
        try:
            # Convert the flow to a pandas dataframe
            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
            # Process features
            dflow = self.process_features(raw_flow)
            # Update the flow to the processed version
            self.flow = dflow
        except Exception as inst:
            self.print('Error in process_flow()')
            self.print(traceback.format_exc(), 0, 1)
    def detect(self):
        """
        Detect this flow with the current model stored
        """
        try:
            # Store the real label if there is one
            y_flow = self.flow['label']
            # Remove the real label column
            self.flow = self.flow.drop('label', axis=1)
            # Remove the label predictions column of the other modules
            X_flow = self.flow.drop('module_labels', axis=1)
            # Scale the flow
            X_flow = self.scaler.transform(X_flow)
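            # transform() reuses the scaling parameters learned during
            # training (the scaler loaded by read_model); it does not refit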
            pred = self.clf.predict(X_flow)
            return pred
        except Exception as inst:
            self.print('Error in detect() X_flow:')
            self.print(X_flow)
            self.print(traceback.format_exc(), 0, 1)
    def store_model(self):
        """
        Store the trained model on disk if it has been updated
        """
        if self.model_updated:
            self.print('Storing the updated trained model and scaler on disk.', 0, 2)
            with open('./modules/flowmldetection/model.bin', 'wb') as f:
                pickle.dump(self.clf, f)
            with open('./modules/flowmldetection/scaler.bin', 'wb') as g:
                pickle.dump(self.scaler, g)
            self.model_updated = False
        else:
            self.print('Trained model has not been updated. Not storing on disk.', 0, 2)
    def read_model(self):
        """
        Read the trained model from disk
        """
        try:
            self.print('Reading the trained model from disk.', 0, 2)
            with open('./modules/flowmldetection/model.bin', 'rb') as f:
                self.clf = pickle.load(f)
            self.print('Reading the trained scaler from disk.', 0, 2)
            with open('./modules/flowmldetection/scaler.bin', 'rb') as g:
                self.scaler = pickle.load(g)
        except FileNotFoundError:
            # If there is no model, create an empty one
            self.print('There was no model. Creating a new empty model.', 0, 2)
            self.clf = SGDClassifier(
                warm_start=True, loss='hinge', penalty='l1'
            )
        except EOFError:
            self.print(
                'Error reading model from disk. Creating a new empty model.',
                0,
                2,
            )
            self.clf = SGDClassifier(
                warm_start=True, loss='hinge', penalty='l1'
            )
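    # Note: the model and scaler are stored with pickle, so only files
    # produced by this module should be loaded; unpickling untrusted data
    # can execute arbitrary code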
    def set_evidence_malicious_flow(
        self, saddr, sport, daddr, dport, profileid, twid, uid
    ):
        """
        Set the evidence that a flow was detected as malicious
        """
        confidence = 0.1
        threat_level = 'low'
        attacker_direction = 'flow'
        category = 'Anomaly.Traffic'
        attacker = f'{saddr}:{sport}-{daddr}:{dport}'
        evidence_type = 'MaliciousFlow'
        ip_identification = __database__.getIPIdentification(daddr)
        description = f'Malicious flow by ML. Src IP {saddr}:{sport} to {daddr}:{dport} {ip_identification}'
        timestamp = utils.convert_format(datetime.datetime.now(), utils.alerts_format)
        __database__.setEvidence(evidence_type, attacker_direction, attacker, threat_level,
                                 confidence, description, timestamp, category,
                                 profileid=profileid, twid=twid)
    def shutdown_gracefully(self):
        # Confirm that the module is done processing
        self.store_model()
        __database__.publish('finished_modules', self.name)
    def run(self):
        utils.drop_root_privs()
        # Load the model
        self.read_model()
        while True:
            try:
                message = __database__.get_message(self.c1)
                if message and message['data'] == 'stop_process':
                    self.shutdown_gracefully()
                    return True
                if utils.is_msg_intended_for(message, 'new_flow'):
                    data = message['data']
                    # Convert from json to dict
                    data = json.loads(data)
                    profileid = data['profileid']
                    twid = data['twid']
                    # Get flow that is now in json format
                    flow = data['flow']
                    # Convert flow to a dict
                    flow = json.loads(flow)
                    # Convert the common fields to something that can
                    # be interpreted
                    # Get the uid which is the key
                    uid = next(iter(flow))
                    self.flow_dict = json.loads(flow[uid])
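                    # Each message carries a single flow keyed by its uid;
                    # the value under that key is itself a JSON string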
                    if self.mode == 'train':
                        # We are training
                        # Is the amount of labels in the DB enough to retrain?
                        # Use labeled flows
                        labels = __database__.get_labels()
                        sum_labeled_flows = sum([i[1] for i in labels])
                        if (
                            sum_labeled_flows >= self.minimum_lables_to_retrain
                            and sum_labeled_flows
                            % self.minimum_lables_to_retrain
                            == 1
                        ):
                            # We get here once every 'self.minimum_lables_to_retrain'
                            # new labels. With the default of 50, retraining happens
                            # at 51, 101, 151, ... labeled flows
                            self.print(
                                f'Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}.'
                            )
                            # Process all flows in the DB and make them ready for pandas
                            self.process_flows()
                            # Train an algorithm
                            self.train()
                    elif self.mode == 'test':
                        # We are testing, which means using the model to detect
                        self.process_flow()
                        # After processing the flow, it may happen that we
                        # deleted icmp/arp/etc., so the dataframe can be empty
                        if not self.flow.empty:
                            # Predict
                            pred = self.detect()
                            label = self.flow_dict['label']
                            # Report
                            if (
                                label
                                and label != 'unknown'
                                and label != pred[0]
                            ):
                                # If the user specified a label in test mode, and it
                                # differs from the prediction, print in debug mode
                                self.print(
                                    f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
                                    f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
                                    f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                                    0,
                                    3,
                                )
                            if pred[0] == 'Malware':
                                # Generate an alert
                                self.set_evidence_malicious_flow(
                                    self.flow_dict['saddr'],
                                    self.flow_dict['sport'],
                                    self.flow_dict['daddr'],
                                    self.flow_dict['dport'],
                                    profileid,
                                    twid,
                                    uid,
                                )
                                self.print(
                                    f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
                                    f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
                                    f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                                    0,
                                    2,
                                )
            except KeyboardInterrupt:
                self.shutdown_gracefully()
                return True
            except Exception as inst:
                self.print('Error in run()')
                self.print(traceback.format_exc(), 0, 1)
                return True