-
Notifications
You must be signed in to change notification settings - Fork 1k
Expand file tree
/
Copy pathclassify.py
More file actions
78 lines (62 loc) · 2.55 KB
/
classify.py
File metadata and controls
78 lines (62 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from __future__ import print_function
import numpy
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
class TopKRanker(OneVsRestClassifier):
def predict(self, X, top_k_list):
probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
all_labels = []
for i, k in enumerate(top_k_list):
probs_ = probs[i, :]
labels = self.classes_[probs_.argsort()[-k:]].tolist()
probs_[:] = 0
probs_[labels] = 1
all_labels.append(probs_)
return numpy.asarray(all_labels)
class Classifier(object):
def __init__(self, embeddings, clf):
self.embeddings = embeddings
self.clf = TopKRanker(clf)
self.binarizer = MultiLabelBinarizer(sparse_output=True)
def train(self, X, Y, Y_all):
self.binarizer.fit(Y_all)
X_train = [self.embeddings[x] for x in X]
Y = self.binarizer.transform(Y)
self.clf.fit(X_train, Y)
def evaluate(self, X, Y):
top_k_list = [len(l) for l in Y]
Y_ = self.predict(X, top_k_list)
Y = self.binarizer.transform(Y)
averages = ["micro", "macro", "samples", "weighted"]
results = {}
for average in averages:
results[average] = f1_score(Y, Y_, average=average)
results['acc'] = accuracy_score(Y,Y_)
print('-------------------')
print(results)
return results
print('-------------------')
def predict(self, X, top_k_list):
X_ = numpy.asarray([self.embeddings[x] for x in X])
Y = self.clf.predict(X_, top_k_list=top_k_list)
return Y
def split_train_evaluate(self, X, Y, train_precent, seed=0):
state = numpy.random.get_state()
training_size = int(train_precent * len(X))
numpy.random.seed(seed)
shuffle_indices = numpy.random.permutation(numpy.arange(len(X)))
X_train = [X[shuffle_indices[i]] for i in range(training_size)]
Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]
self.train(X_train, Y_train, Y)
numpy.random.set_state(state)
return self.evaluate(X_test, Y_test)
def read_node_label(filename, skip_head=False):
fin = pd.read_csv(filename)
cols=fin.columns
X = fin[cols[0]]
Y = fin[cols[1]]
return X, Y