-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
98 lines (82 loc) · 3.28 KB
/
utils.py
File metadata and controls
98 lines (82 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import matthews_corrcoef
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
def build_dataloader(df: pd.DataFrame, embed_path: Path, labels: str | list[str] = "label"):
"""
Build a DataLoader for the given DataFrame and embedding path.
:param df: DataFrame containing the data.
:param embed_path: Path to the directory containing the embeddings.
:param dataloader_kwargs: Additional arguments for DataLoader.
:return: DataLoader for the embeddings and targets.
"""
embed_path = Path(embed_path)
embeddings = []
valid_ids = set()
for idx in df["ID"].values:
try:
with open(embed_path / f"{idx}.pkl", "rb") as f:
tmp = pickle.load(f)
if not isinstance(tmp, np.ndarray):
tmp = tmp.cpu().numpy()
embeddings.append(tmp)
valid_ids.add(idx)
except Exception:
pass
inputs = np.stack(embeddings)
targets = np.array(df[df["ID"].isin(valid_ids)][labels].values).astype(np.float32)
# Shuffle the inputs and targets
permut = np.random.permutation(inputs.shape[0])
inputs = inputs[permut]
targets = targets[permut]
return inputs, targets
def multioutput_mcc(y_true, y_pred):
    """
    Compute the average Matthews Correlation Coefficient (MCC) for a multi-output task.

    Parameters:
    - y_true: array-like of shape (n_samples, n_outputs), or (n_samples,) for a single output
    - y_pred: array-like with the same shape as y_true

    Returns:
    - float: unweighted mean of the per-output MCC values; an output for which
      matthews_corrcoef raises ValueError contributes 0.0

    Raises:
    - ValueError: if y_true and y_pred do not have the same shape
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Explicit raise instead of assert: asserts are stripped when Python runs with -O.
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes of y_true and y_pred must match")

    # Treat a 1-D input as a single output column.
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)

    mccs = []
    for i in range(y_true.shape[1]):
        try:
            mcc = matthews_corrcoef(y_true[:, i], y_pred[:, i])
        except ValueError:
            # Score the column as 0.0 when the MCC cannot be computed for it.
            mcc = 0.0
        mccs.append(mcc)
    return np.mean(mccs)
def fit_model(task, algo, trainX, trainY, binary: bool = False) -> sklearn.base.BaseEstimator:
    """
    Build the estimator selected by ``task`` and ``algo`` and fit it on the training data.

    ``task == "regression"`` selects the regression models; every other value
    of ``task`` is handled by the classification branch.

    Supported combinations:
    - regression: ``"lr"`` (LinearRegression), ``"xgb"`` (XGBRegressor), ``"knn"`` (KNeighborsRegressor)
    - otherwise: ``"lr"`` (LogisticRegression when ``binary`` is true, else a
      MultiOutputClassifier wrapping LogisticRegression), ``"knn"`` (KNeighborsRegressor)

    NOTE(review): the non-regression ``"knn"`` case fits a KNeighborsRegressor,
    not a classifier - confirm this is intended.

    :param task: Task name; only ``"regression"`` is treated specially.
    :param algo: Algorithm key: ``"lr"``, ``"xgb"`` or ``"knn"``.
    :param trainX: Training inputs passed to ``fit``.
    :param trainY: Training targets passed to ``fit``.
    :param binary: Only used for non-regression ``"lr"``; picks the plain LogisticRegression.
    :return: Whatever the chosen estimator's ``fit`` returns.
    :raises ValueError: If the task/algorithm combination is not supported.
    """
    regression = task == "regression"
    estimator = None

    if algo == "knn":
        # Both branches use the same cosine-distance nearest-neighbour regressor.
        estimator = KNeighborsRegressor(
            n_neighbors=5,
            weights="distance",
            algorithm="brute",
            metric="cosine",
        )
    elif algo == "lr":
        if regression:
            estimator = LinearRegression()
        elif binary:
            estimator = LogisticRegression()
        else:
            estimator = MultiOutputClassifier(LogisticRegression())
    elif algo == "xgb" and regression:
        estimator = XGBRegressor(
            tree_method="hist",
            n_estimators=50,
            max_depth=20,
            random_state=42,
            device="cpu",
        )

    if estimator is None:
        raise ValueError(f"Unknown task: {task} or algorithm: {algo}")
    return estimator.fit(trainX, trainY)