mutationorigin/mutation_origin/util.py at fad00fca3c1073637ede2c6948f5278a030971dc · HuttleyLab/mutationorigin · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import re
import sys
from subprocess import Popen, PIPE
import os
import pickle
import json
import pandas
import numpy
from tqdm import tqdm
from cogent3.util.misc import open_, get_format_suffixes
from cogent3 import make_table


# module level metadata
__author__ = 'Gavin Huttley'
__copyright__ = 'Copyright 2014, Gavin Huttley'
__credits__ = ['Yicheng Zhu', 'Cheng Soon Ong', 'Gavin Huttley']
__license__ = 'BSD'
__version__ = '0.3'
__maintainer__ = 'Gavin Huttley'
__email__ = 'Gavin.Huttley@anu.edu.au'
__status__ = 'Development'


# wildcard file name patterns; the keys look like the names of the processing
# steps that produce the files -- TODO confirm against the callers.
# 'sample_data' maps to separate patterns for the train and test files
FILENAME_PATTERNS = {'sample_data': dict(train='train-*.tsv.gz',
                                         test='test-*.tsv.gz'),
                     'train': '*-classifier-*.pkl*',
                     'predict': '*-predicted-*.json.gz',
                     'performance': '*-performance.json.gz'}
# the 12 ordered pairs of distinct bases, written as '<from>to<to>'
MUTATION_DIRECTIONS = ('AtoC', 'AtoG', 'AtoT', 'CtoA', 'CtoG', 'CtoT',
                       'GtoA', 'GtoC', 'GtoT', 'TtoA', 'TtoC', 'TtoG')
# evaluates to ('A', 'C', 'G', 'T')
BASES = tuple('ACGT')


def exec_command(cmnd, stdout=PIPE, stderr=PIPE):
    '''executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------

    cmnd : str
      shell command to be executed. It is handed to the shell as is
      (shell=True), so it must not be built from untrusted input.
    stdout, stderr : streams
      Default value (PIPE) intercepts process output, setting to None
      means that stream is not captured.

    Returns
    -------
    The captured stdout decoded as utf8, or None if stdout was not captured.

    Notes
    -----
    If the command exits with a non-zero code, a message containing the
    command and any captured stderr is written to sys.stderr and sys.exit()
    is called with that exit code.
    '''
    proc = Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        # err is bytes when stderr was captured and None otherwise, decode
        # it so the report is readable text rather than a bytes repr
        msg = '' if err is None else err.decode('utf8', errors='replace')
        sys.stderr.write('FAILED: %s\n%s' % (cmnd, msg))
        sys.exit(proc.returncode)

    return None if out is None else out.decode('utf8')


def get_enu_germline_sizes(total, enu_ratio):
    '''returns (enu, germline) sample sizes that sum to total

    total is split into (enu_ratio + 1) equal parts, enu gets enu_ratio of
    them (truncated to an int) and germline gets whatever remains.
    '''
    per_part = total / (enu_ratio + 1)
    num_enu = int(per_part * enu_ratio)
    return num_enu, total - num_enu


def valid_response_values(data):
    '''returns True if every value in data is either 'e' or 'g'

    An empty data also returns True.
    '''
    allowed = {'e', 'g'}
    return allowed.issuperset(data)


def dump_json(path, data):
    '''writes data to path in json format

    Parameters
    ----------
    path
      destination file, opened in text write mode using open_
    data
      the object passed to json.dump
    '''
    with open_(path, mode='wt') as out:
        json.dump(data, out)


def load_json(path):
    '''returns the object decoded from the json file at path'''
    with open_(path) as src:
        return json.load(src)


def load_predictions(path):
    '''returns dataframe, params, classifier path, classifier label

    The json file at path must have 'feature_params' and 'predictions'
    entries. The 'predictions' entry is converted to a pandas DataFrame.
    'classifier_path' and 'classifier_label' are optional, None is returned
    for whichever is absent.
    '''
    content = load_json(path)
    feature_params = content['feature_params']
    frame = pandas.DataFrame(content['predictions'])
    return (frame, feature_params,
            content.get('classifier_path'),
            content.get('classifier_label'))


def load_classifier(path):
    '''returns classifier, feature_params, scaler from a pickled dict

    Parameters
    ----------
    path
      path to a pickle file, opened in binary mode using open_. The
      unpickled object is expected to be a dict with 'classifier' and
      'feature_params' entries and an optional 'scaler' entry.

    Returns
    -------
    classifier, feature_params, scaler. scaler is None if the pickled dict
    has no 'scaler' entry.

    Raises
    ------
    ValueError if the unpickled object lacks the required entries, or is
    not subscriptable by key (e.g. a bare classifier was pickled).
    '''
    # NOTE: unpickling can execute arbitrary code, only use on trusted files
    with open_(path, 'rb') as clf:
        data = pickle.load(clf)

    try:
        feature_params = data['feature_params']
        classifier = data['classifier']
    except (KeyError, TypeError) as err:
        # TypeError arises when the pickled object is not a dict, which is
        # the same "no classifier here" condition from the caller's view
        raise ValueError('pickle formatted file does not '
                         'contain classifier') from err

    scaler = data.get('scaler', None)
    return classifier, feature_params, scaler


def get_basename(path):
    '''returns a file basename without the suffixes

    The directory part of path is dropped, then everything from the format
    suffix onwards is removed. If get_format_suffixes() reports no format
    suffix, the compression suffix is removed instead. If neither is
    reported, the basename is returned unchanged.
    '''
    bn = os.path.basename(path)
    suffix, cmp_suffix = get_format_suffixes(bn)
    # when no format suffix is reported, fall back to the compression
    # suffix -- formatting None into the search string can never match
    marker = suffix or cmp_suffix
    if not marker:
        return bn

    rindex = bn.rfind(f'.{marker}')
    if rindex < 0:
        # rfind() signals "not found" with -1, slicing with that would
        # silently chop the last character off the name
        return bn
    return bn[:rindex]


def get_classifier_label(classifier):
    '''returns short string label derived from the classifier class name

    The lower cased class name is checked, in order, for containing
    'logistic' ('lr'), 'nb' ('nb'), 'svm' ('ocs') and finally for starting
    with 'xgb' ('xgb'). A ValueError is raised if none of these apply.
    '''
    name = classifier.__class__.__name__.lower()
    # (fragment of class name, label), first match wins
    by_fragment = (('logistic', 'lr'), ('nb', 'nb'), ('svm', 'ocs'))
    for fragment, label in by_fragment:
        if fragment in name:
            return label

    if name.startswith('xgb'):
        return 'xgb'

    raise ValueError(f'Unknown classifier type {name}')


def dirname_from_features(features):
    '''returns a directory name encoding a feature set

    The name is 'f<flank_size>', followed by 'd<feature_dim>' if
    feature_dim is set, 'p' if proximal is set and 'GC' if usegc is set.
    Only 'flank_size' is a required key of features.
    '''
    parts = [f'f{features["flank_size"]}']
    dim = features.get('feature_dim')
    if dim:
        parts.append(f'd{dim}')
    if features.get('proximal'):
        parts.append('p')
    if features.get('usegc'):
        parts.append('GC')
    return ''.join(parts)


def flank_dim_combinations(max_flank=4, start_flank=0, flank_sizes=None,
                           get_dims=None):
    '''returns list of dicts of flank_size/feature_dim combinations

    Parameters
    ----------
    max_flank, start_flank : int
      used only when flank_sizes is empty or None, in which case the flank
      sizes are range(start_flank, max_flank), i.e. max_flank is excluded
    flank_sizes
      series of flank sizes to use
    get_dims : callable
      takes a flank size and returns the dimensions to pair it with. If
      None, the dimensions are range(1, 2 * flank size).

    A flank size of 0 produces a single dict without a feature_dim entry.
    '''
    if not flank_sizes:
        flank_sizes = range(start_flank, max_flank)

    result = []
    for size in flank_sizes:
        if size == 0:
            result.append({'flank_size': size})
            continue

        dims = range(1, 2 * size) if get_dims is None else get_dims(size)
        result.extend({'flank_size': size, 'feature_dim': dim}
                      for dim in dims)

    return result


_size = re.compile(r'(?<=/)\d+(?=k/)')


def sample_size_from_path(path):
    '''returns the sample size indicated by a path component, or None

    Looks for the first path component of the form '/<digits>k/' and
    returns the digits as an int multiplied by 1000. Returns None if path
    has no such component.
    '''
    found = _size.search(path)
    if found is None:
        return None
    return int(found.group()) * 1000


_rep = re.compile(r'(?<=-)\d+(?=[-.])')


def data_rep_from_path(src, path):
    '''returns the first number embedded in the file name of path

    The number is the first run of digits in the basename that follows a
    '-' and precedes a '-' or '.', returned as a string. Presumably this
    is the data replicate number -- TODO confirm against the callers.

    src is not used. An IndexError is raised if the basename has no such
    number.
    '''
    matches = _rep.findall(os.path.basename(path))
    return matches[0]


_feats = re.compile(r'f\d+[d\d]*p*[GC]*')


def feature_set_from_path(path):
    '''returns the last feature set label occurring in path

    A label is 'f<digits>' optionally followed by 'd'/digit characters,
    'p' characters and 'G'/'C' characters, e.g. 'f2d2pGC'. An IndexError
    is raised if path contains no label.
    '''
    return _feats.findall(path)[-1]


def model_name_from_features(flank, dim, usegc, proximal):
    '''returns the model name from a feature set

    Parameters
    ----------
    flank : int
      flank size
    dim : int
      feature dimension, not examined when flank is 0
    usegc : bool
      if True, 'GC' is appended as the last term
    proximal : bool
      if True, the terms for dimensions >= 2 get a 'p' suffix. Must be
      False when dim == 2 * flank.

    Returns
    -------
    The model terms joined by '+':

    - flank of 0 gives 'M'
    - dim of 1 gives 'M+I'
    - dim equal to 2 * flank gives 'FS'
    - any other dim > 1 gives 'M+I+2D+...+<dim>D'

    Raises
    ------
    ValueError if the flank, dim combination matches none of the above.
    '''
    if flank == 0:
        model = ['M']
    elif flank and dim == 1:
        model = ['M', 'I']
    elif flank and dim == 2 * flank:
        assert not proximal
        model = ['FS']
    elif dim > 1:
        prox = 'p' if proximal else ''
        model = ['M', 'I'] + [f'{i}D{prox}' for i in range(2, dim + 1)]
    else:
        # report both offending values (flank used to be reported twice)
        raise ValueError('Unexpected model', flank, dim)

    if usegc:
        model.append('GC')

    return '+'.join(model)


def summary_stat_table(table, factors):
    '''returns table of mean and std of the performance measures

    Parameters
    ----------
    table
      table with 'auc', 'mean_precision' and 'balanced_accuracy' columns,
      plus any number of columns whose name starts with 'fscore'
    factors
      names of the columns whose distinct value combinations define the
      groups of rows to summarise

    Returns
    -------
    A table with one row per distinct combination of factors, containing
    the mean and sample standard deviation (ddof=1) of each measure within
    that group, sorted by descending mean_auc.
    '''
    fscore_cols = [name for name in table.header
                   if name.startswith('fscore')]
    # the order here fixes the order of the statistics within each row
    measures = ['auc', 'mean_precision', 'balanced_accuracy'] + fscore_cols

    rows = []
    for comb in tqdm(table.distinct_values(factors), ncols=80):
        subtable = table.filtered(lambda x: tuple(x) == tuple(comb),
                                  columns=factors)
        row = list(comb)
        for measure in measures:
            values = numpy.array(subtable.tolist(measure))
            row.extend([values.mean(), values.std(ddof=1)])
        rows.append(row)

    header = list(factors)
    header += ['mean_auc', 'std_auc', 'mean_ap', 'std_ap',
               'mean_balanced_accuracy', 'std_balanced_accuracy']
    for col in fscore_cols:
        header += [f'mean_{col}', f'std_{col}']

    summary = make_table(header=header, data=rows)
    return summary.sorted(reverse='mean_auc')


def iter_indices(total, block_size):
    '''yields lists of consecutive indices covering 0 to total - 1

    Each list has block_size members, except the last which holds whatever
    remains.
    '''
    for begin in range(0, total, block_size):
        stop = begin + block_size
        if stop > total:
            stop = total
        yield list(range(begin, stop))


def skip_path(exclude_paths, path):
    '''returns True if any member of exclude_paths occurs within path'''
    return any(excluded in path for excluded in exclude_paths)